In [1]:
import pandas as pd
import numpy as np

import sklearn 
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('./data/preprocessed_tweets_with_race&age.csv')
df

Unnamed: 0.1,Unnamed: 0,user_id,all_tweets,is_female,year_born,race,age
0,0,12488,ykar futuristic sans serif font who can contac...,0.0,1980.0,4.0,1
1,1,719703,other words good news about the vaccine safety...,0.0,1985.0,4.0,1
2,2,749003,would fair call lil nas the first successful o...,0.0,1982.0,5.0,1
3,3,822540,bonk nice mcboy oos getting real tired games...,0.0,1979.0,4.0,1
4,4,865071,how about pizza dipped water day quarantine in...,0.0,1995.0,4.0,1
...,...,...,...,...,...,...,...
3266,3270,3196361888,back someone called hungry who back watch sock...,1.0,1995.0,1.0,1
3267,3272,3352812676,women guide burn fat and build muscle the holy...,1.0,1973.0,4.0,1
3268,3273,3924536853,even though school cancelled and grades don ma...,1.0,1993.0,4.0,1
3269,3274,4281628276,and what are you drunk hillary supporter women...,0.0,1987.0,4.0,1


In [3]:
## stem 
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [4]:
all_stem_wrds = []
for txt in df['all_tweets']:
    wrds = txt.split()
    stem_wrds = []
    for i in wrds:
        stem_wrds.append(stemmer.stem(i))
    
    str1 = ' '.join(stem_wrds)
    all_stem_wrds.append(str1)

df['stemmed_tweets'] = all_stem_wrds

In [5]:
print(df['stemmed_tweets'][0])
print('##########')
print(df['all_tweets'][0])

ykar futurist san serif font who can contact about the veri rude and poor servic experienc dure current stay pleas and thank you like win now head bth floor check thi out thank you ye do now pleas help got scam and need help find who need talk thank you wa have fun with new friend befor the invas slider fossil you develop and you use chrome you have check out initab check out thi custom mosaic creat with the pictur mosaic onlin mosaic tool cmd middl max min diagon true cmd middl max min diagon true revers true cmd vertic true max min smooth true method uniqu use instagram for io add new featur instagram just complet the howard wolowitz achiev download just complet the idl crafter achiev download just discov the cook red meat recip download just discov the fox recip download just discov the light fur bag recip download just discov the reptil soup recip download just discov the brush recip download just discov the trowel head recip download just complet the god they kill minion you basta

In [6]:
vectorizer = TfidfVectorizer(stop_words='english', max_features = 5000)

X = vectorizer.fit_transform(df['stemmed_tweets'])
print(X.shape)

(3271, 5000)


label formatting replace race code NaN with 0.0

In [7]:
df['race'] = df['race'].replace(np.nan, 0)

In [8]:
df['race'].value_counts()

4.0    2534
1.0     299
2.0     186
5.0     120
3.0     102
0.0      30
Name: race, dtype: int64

4 is white, 1 is black, and merge all other race to 0

In [9]:
df['race'] = np.where(
   (df['race'] != 4) & (df['race'] != 1) , 0.0, df['race']
   )
print(df['race'].value_counts())
y = df['race']

4.0    2534
0.0     438
1.0     299
Name: race, dtype: int64


### logistic regression + TFIDF vectorizor & cross validation

In [10]:
from sklearn import metrics, preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [11]:
from sklearn.metrics import classification_report, accuracy_score, make_scorer, confusion_matrix

def classification_report_with_accuracy_score(y_true, y_pred):
    print(classification_report(y_true, y_pred)) # print classification report
    print(confusion_matrix(y_true, y_pred, labels = [0,1,4]))
    return accuracy_score(y_true, y_pred) # return accuracy score

In [12]:
clf = make_pipeline(preprocessing.StandardScaler(with_mean=False), LogisticRegression(max_iter=500))
scores = cross_val_score(clf, X, y, cv=5, \
               scoring=make_scorer(classification_report_with_accuracy_score))

              precision    recall  f1-score   support

         0.0       0.19      0.11      0.14        88
         1.0       0.40      0.27      0.32        60
         4.0       0.81      0.90      0.85       507

    accuracy                           0.74       655
   macro avg       0.47      0.43      0.44       655
weighted avg       0.69      0.74      0.71       655

[[ 10   8  70]
 [  8  16  36]
 [ 34  16 457]]
              precision    recall  f1-score   support

         0.0       0.28      0.19      0.23        88
         1.0       0.39      0.32      0.35        59
         4.0       0.82      0.88      0.85       507

    accuracy                           0.74       654
   macro avg       0.50      0.47      0.48       654
weighted avg       0.71      0.74      0.72       654

[[ 17   8  63]
 [  7  19  33]
 [ 37  22 448]]
              precision    recall  f1-score   support

         0.0       0.23      0.13      0.16        87
         1.0       0.40      0.32    

In [13]:
sklearn.__version__

'1.0'

### word2vec + Logistic Regression

In [14]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [23]:
model = Word2Vec(sentences=df['all_tweets'], vector_size=500, min_count=5)    
# Store just the words + their trained embeddings.
word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")

In [24]:
# Load back with memory-mapping = read-only, shared across processes.
wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')
# print(wv['ま'])
def document_vector(doc, wv = wv):
    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
    doc = [word for word in doc if word in wv.key_to_index]
    return np.mean(wv[doc], axis=0)

In [25]:
df['doc_vector'] = df.all_tweets.apply(document_vector)

In [26]:
X = list(df['doc_vector'])

In [27]:
clf = make_pipeline(preprocessing.StandardScaler(with_mean=False), LogisticRegression(max_iter=5000))
scores = cross_val_score(clf, X, y, cv=5, \
               scoring=make_scorer(classification_report_with_accuracy_score))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       1.00      0.01      0.02        88
         1.0       0.00      0.00      0.00        60
         4.0       0.78      1.00      0.87       507

    accuracy                           0.78       655
   macro avg       0.59      0.34      0.30       655
weighted avg       0.73      0.78      0.68       655

[[  1   0  87]
 [  0   0  60]
 [  0   0 507]]
              precision    recall  f1-score   support

         0.0       0.29      0.02      0.04        88
         1.0       0.00      0.00      0.00        59
         4.0       0.77      0.98      0.87       507

    accuracy                           0.77       654
   macro avg       0.35      0.34      0.30       654
weighted avg       0.64      0.77      0.68       654

[[  2   0  86]
 [  0   0  59]
 [  5   3 499]]
              precision    recall  f1-score   support

         0.0       0.20      0.01      0.02        87
         1.0       0.11      0.02    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
