In [1]:
import pandas as pd
import numpy as np

import sklearn 
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('./data/preprocessed_tweets_with_race&age.csv')
df

Unnamed: 0.1,Unnamed: 0,user_id,all_tweets,is_female,year_born,race
0,0,12488,ykar futuristic sans serif font who can contac...,0.0,1980.0,4.0
1,1,719703,other words good news about the vaccine safety...,0.0,1985.0,4.0
2,2,749003,would fair call lil nas the first successful o...,0.0,1982.0,5.0
3,3,822540,bonk nice mcboy oos getting real tired games...,0.0,1979.0,4.0
4,4,865071,how about pizza dipped water day quarantine in...,0.0,1995.0,4.0
...,...,...,...,...,...,...
3266,3270,3196361888,back someone called hungry who back watch sock...,1.0,1995.0,1.0
3267,3272,3352812676,women guide burn fat and build muscle the holy...,1.0,1973.0,4.0
3268,3273,3924536853,even though school cancelled and grades don ma...,1.0,1993.0,4.0
3269,3274,4281628276,and what are you drunk hillary supporter women...,0.0,1987.0,4.0


In [3]:
## stem 
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [4]:
all_stem_wrds = []
for txt in df['all_tweets']:
    wrds = txt.split()
    stem_wrds = []
    for i in wrds:
        stem_wrds.append(stemmer.stem(i))
    
    str1 = ' '.join(stem_wrds)
    all_stem_wrds.append(str1)

df['stemmed_tweets'] = all_stem_wrds

In [5]:
## drop NaN
df = df.dropna(subset=['race'])

In [6]:
vectorizer = TfidfVectorizer(stop_words='english', max_features = 5000)

X = vectorizer.fit_transform(df['stemmed_tweets'])
print(X.shape)

(3241, 5000)


In [7]:
df['race'].value_counts()

4.0    2534
1.0     299
2.0     186
5.0     120
3.0     102
Name: race, dtype: int64

4 is white, 1 is black, and merge all other race to 0

In [8]:
df.loc[:, 'race'] = np.where(
   (df['race'] != 4) & (df['race'] != 1) , 0.0, df['race']
   )

print(df['race'].value_counts())
y = df['race']

4.0    2534
0.0     408
1.0     299
Name: race, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


### logistic regression + TFIDF vectorizor & cross validation

In [9]:
from sklearn import metrics, preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [10]:
from sklearn.metrics import classification_report, accuracy_score, make_scorer, confusion_matrix

def classification_report_with_accuracy_score(y_true, y_pred):
    print(classification_report(y_true, y_pred)) # print classification report
    print(confusion_matrix(y_true, y_pred, labels = [0,1,4]))
    return accuracy_score(y_true, y_pred) # return accuracy score

#### classification report for race

In [11]:
clf = make_pipeline(preprocessing.StandardScaler(with_mean=False), LogisticRegression(max_iter=500))
scores = cross_val_score(clf, X, y, cv=5, \
               scoring=make_scorer(classification_report_with_accuracy_score))

              precision    recall  f1-score   support

         0.0       0.23      0.12      0.16        82
         1.0       0.45      0.28      0.35        60
         4.0       0.82      0.92      0.87       507

    accuracy                           0.76       649
   macro avg       0.50      0.44      0.46       649
weighted avg       0.71      0.76      0.73       649

[[ 10   8  64]
 [  6  17  37]
 [ 28  13 466]]
              precision    recall  f1-score   support

         0.0       0.24      0.16      0.19        82
         1.0       0.38      0.29      0.33        59
         4.0       0.83      0.89      0.86       507

    accuracy                           0.75       648
   macro avg       0.48      0.45      0.46       648
weighted avg       0.71      0.75      0.73       648

[[ 13   8  61]
 [  8  17  34]
 [ 34  20 453]]
              precision    recall  f1-score   support

         0.0       0.15      0.07      0.10        81
         1.0       0.42      0.33    

#### classification report for age

In [12]:
df_age = pd.read_csv('./data/preprocessed_tweets_with_for_age_pred.csv',  lineterminator='\n')

In [13]:
all_stem_wrds = []
for txt in df_age['all_tweets']:
    wrds = txt.split()
    stem_wrds = []
    for i in wrds:
        stem_wrds.append(stemmer.stem(i))
    
    str1 = ' '.join(stem_wrds)
    all_stem_wrds.append(str1)

df_age['stemmed_tweets'] = all_stem_wrds

In [14]:
y_age = df_age['human.labeled.age']
df_age['human.labeled.age'].value_counts()

1    718
0    427
Name: human.labeled.age, dtype: int64

In [15]:
X_age = vectorizer.fit_transform(df_age['stemmed_tweets'])
print(X_age.shape)

(1145, 5000)


In [16]:
scores = cross_val_score(clf, X_age, y_age, cv=5, \
               scoring=make_scorer(classification_report_with_accuracy_score))

              precision    recall  f1-score   support

           0       0.61      0.49      0.55        85
           1       0.73      0.81      0.77       144

    accuracy                           0.69       229
   macro avg       0.67      0.65      0.66       229
weighted avg       0.69      0.69      0.69       229

[[ 42  43   0]
 [ 27 117   0]
 [  0   0   0]]
              precision    recall  f1-score   support

           0       0.60      0.40      0.48        85
           1       0.70      0.84      0.77       144

    accuracy                           0.68       229
   macro avg       0.65      0.62      0.62       229
weighted avg       0.66      0.68      0.66       229

[[ 34  51   0]
 [ 23 121   0]
 [  0   0   0]]
              precision    recall  f1-score   support

           0       0.66      0.41      0.51        85
           1       0.72      0.88      0.79       144

    accuracy                           0.70       229
   macro avg       0.69      0.64   

### word2vec + Logistic Regression

In [17]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [18]:
model = Word2Vec(sentences=df['all_tweets'], vector_size=500, min_count=5)    
# Store just the words + their trained embeddings.
word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")

# Load back with memory-mapping = read-only, shared across processes.
wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')
# print(wv['ま'])
def document_vector(doc, wv = wv):
    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
    doc = [word for word in doc if word in wv.key_to_index]
    return np.mean(wv[doc], axis=0)

df.loc[:, 'doc_vector']  = df.all_tweets.apply(document_vector)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [19]:
X = list(df['doc_vector'])

In [20]:
clf = make_pipeline(preprocessing.StandardScaler(with_mean=False), LogisticRegression(max_iter=5000))
scores = cross_val_score(clf, X, y, cv=5, \
               scoring=make_scorer(classification_report_with_accuracy_score))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       1.00      0.01      0.02        82
         1.0       0.00      0.00      0.00        60
         4.0       0.78      1.00      0.88       507

    accuracy                           0.78       649
   macro avg       0.59      0.34      0.30       649
weighted avg       0.74      0.78      0.69       649

[[  1   0  81]
 [  0   0  60]
 [  0   0 507]]
              precision    recall  f1-score   support

         0.0       0.17      0.01      0.02        82
         1.0       0.00      0.00      0.00        59
         4.0       0.78      0.99      0.87       507

    accuracy                           0.77       648
   macro avg       0.32      0.33      0.30       648
weighted avg       0.63      0.77      0.69       648

[[  1   0  81]
 [  1   0  58]
 [  4   3 500]]
              precision    recall  f1-score   support

         0.0       0.20      0.01      0.02        81
         1.0       0.12      0.02    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Age prediction with word2vec

In [21]:
model = Word2Vec(sentences=df_age['all_tweets'], vector_size=500, min_count=5)    
# Store just the words + their trained embeddings.
word_vectors = model.wv
word_vectors.save("word2vec_age.wordvectors")

# Load back with memory-mapping = read-only, shared across processes.
wv = KeyedVectors.load("word2vec_age.wordvectors", mmap='r')
def document_vector(doc, wv = wv):
    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
    doc = [word for word in doc if word in wv.key_to_index]
    return np.mean(wv[doc], axis=0)
df_age.loc[:, 'doc_vector']  = df_age.all_tweets.apply(document_vector)

In [25]:
X_age = list(df_age['doc_vector'])
len(X_age)

1145

In [23]:
df_age['all_tweets'].shape

(1145,)

In [26]:
scores = cross_val_score(clf, X_age, y_age, cv=5, \
               scoring=make_scorer(classification_report_with_accuracy_score))

              precision    recall  f1-score   support

           0       0.59      0.20      0.30        85
           1       0.66      0.92      0.77       144

    accuracy                           0.65       229
   macro avg       0.62      0.56      0.53       229
weighted avg       0.63      0.65      0.59       229

[[ 17  68   0]
 [ 12 132   0]
 [  0   0   0]]
              precision    recall  f1-score   support

           0       0.57      0.15      0.24        85
           1       0.65      0.93      0.77       144

    accuracy                           0.64       229
   macro avg       0.61      0.54      0.50       229
weighted avg       0.62      0.64      0.57       229

[[ 13  72   0]
 [ 10 134   0]
 [  0   0   0]]
              precision    recall  f1-score   support

           0       0.50      0.24      0.32        85
           1       0.66      0.86      0.74       144

    accuracy                           0.63       229
   macro avg       0.58      0.55   