In [1]:
import pandas as pd
import numpy as np

import sklearn 
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
race_df = pd.read_csv('./data/preprocessed_race_tweets.csv', index_col = 0)
race_df

Unnamed: 0,text,label,user_id
0,ykar futuristic sans serif font,4.0,12488
1,other words good news about the vaccine safety...,4.0,719703
4,how about pizza dipped water,4.0,865071
5,hire better programmers your website dumpster ...,4.0,988211
6,walking home from the adella wonders the raven...,4.0,1025311
...,...,...,...
327593,username danisonbottom wattpad,2.0,3178803853
327594,like going summer shopping today,1.0,3196361888
327596,what the best for guide nutritional needs heal...,4.0,3352812676
327597,freakin panthers,4.0,3924536853


In [3]:
# Discard every row that has a missing value in any column.
race_df = race_df.dropna()

In [4]:
race_df['label'].value_counts()

4.0    224287
1.0     26269
2.0     16292
3.0      9189
Name: label, dtype: int64

In [45]:
# Stem every whitespace-separated word of each text with NLTK's PorterStemmer
# and write the re-joined strings back into the 'text' column.
# NOTE(review): `df` is not assigned in any cell visible in this notebook, and
# this cell's execution count (45) is out of sequence with its neighbours — it
# looks like a leftover from an earlier session. Confirm which frame it was
# meant to operate on (or remove it) before relying on Restart & Run All.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# One stemmed string per input text, in the same order as df['text'].
all_stem_wrds = []
for txt in df['text']:
    wrds = txt.split()
    stem_wrds = []
    for i in wrds:
        stem_wrds.append(stemmer.stem(i))

    # Re-join the stems with single spaces.
    str1 = ' '.join(stem_wrds)
    all_stem_wrds.append(str1)

df['text'] = all_stem_wrds

### Sampling from each label
1: Black, 2: Latino/Hispanic, 3: Asian, 4: White

In [5]:
freq = pd.DataFrame({'label':[1, 2, 3, 4],
                     'nostoextract':[9000, 9000, 9000, 9000], })

def bootstrap(data, freq, random_state=None):
    """Draw a random per-class subsample of `data`.

    Rows are grouped by the 'label' column and, for each class, at most the
    requested number of rows is drawn without replacement (the default of
    `DataFrame.sample`). A class with fewer rows than requested is returned
    in full (shuffled).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'label' column.
    freq : pandas.DataFrame
        One row per class, with columns 'label' (class value) and
        'nostoextract' (number of rows wanted for that class).
    random_state : int, numpy.random.RandomState, numpy.random.Generator or None
        Seed or random generator used for the draws. An integer seed is turned
        into a single generator shared by all classes, so the whole result is
        reproducible. `None` (the default) keeps the previous unseeded
        behaviour.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, ordered by ascending class, indexed by the row
        labels they had in `data`. Empty input yields an empty frame with the
        same columns.

    Raises
    ------
    KeyError
        If a class present in `data` has no entry in `freq`.
    """
    # Per-class quota, addressable by class value.
    quota = freq.set_index('label')['nostoextract']

    # Build one generator from an integer seed so that successive classes do
    # not all restart from the same random stream.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    pieces = []
    for cls, classgroup in data.groupby('label'):
        n_samples = min(len(classgroup), int(quota[cls]))
        pieces.append(classgroup.sample(n=n_samples, random_state=rng))

    if not pieces:
        # Nothing to sample from: return an empty frame with the same schema.
        return data.iloc[0:0]

    # Each piece keeps its original row labels, so no index surgery is needed.
    return pd.concat(pieces)

sampled_race_df = bootstrap(race_df,freq)

In [6]:
## stem 
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [7]:
# Porter-stem each sampled tweet: split on whitespace, stem token by token,
# then glue the stems back together with single spaces.
stem_list = [
    ' '.join(stemmer.stem(token) for token in tweet.split())
    for tweet in sampled_race_df['text']
]

sampled_race_df['text'] = stem_list

In [8]:
vectorizer = TfidfVectorizer(stop_words='english', max_features = 5000)

X = vectorizer.fit_transform(sampled_race_df['text'])
print(X.shape)
y = sampled_race_df['label']

(36000, 5000)


### Logistic regression + TF-IDF vectorizer & cross-validation

In [9]:
from sklearn import metrics, preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [10]:
from sklearn.metrics import classification_report, accuracy_score, make_scorer, confusion_matrix

def classification_report_with_accuracy_score(y_true, y_pred):
    """Print a classification report for the predictions and return accuracy.

    Intended to be wrapped with `make_scorer`, so that each cross-validation
    fold prints its own per-class report while the accuracy is what gets
    collected as the fold's score.
    """
    report = classification_report(y_true, y_pred)
    print(report)
    # print(confusion_matrix(y_true, y_pred, labels = [1,2,3,4], normalize='true'))
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

#### classification report for race

In [11]:
# Scale the features (no mean-centring) and fit logistic regression, scored
# with 5-fold cross-validation; every fold prints its classification report.
# NOTE(review): X was produced by a TF-IDF vectorizer fitted on all sampled
# rows before the split, so held-out text influenced the vocabulary/IDF
# weights — consider fitting the vectorizer inside the pipeline.
clf = make_pipeline(
    preprocessing.StandardScaler(with_mean=False),
    LogisticRegression(max_iter=500),
)
scores = cross_val_score(
    clf, X, y,
    cv=5,
    scoring=make_scorer(classification_report_with_accuracy_score),
)

              precision    recall  f1-score   support

         1.0       0.38      0.38      0.38      1800
         2.0       0.33      0.31      0.32      1800
         3.0       0.37      0.42      0.39      1800
         4.0       0.34      0.32      0.33      1800

    accuracy                           0.36      7200
   macro avg       0.36      0.36      0.36      7200
weighted avg       0.36      0.36      0.36      7200

              precision    recall  f1-score   support

         1.0       0.38      0.37      0.38      1800
         2.0       0.34      0.33      0.34      1800
         3.0       0.37      0.41      0.39      1800
         4.0       0.33      0.31      0.32      1800

    accuracy                           0.36      7200
   macro avg       0.36      0.36      0.35      7200
weighted avg       0.36      0.36      0.35      7200

              precision    recall  f1-score   support

         1.0       0.39      0.37      0.38      1800
         2.0       0.

#### classification report for age

In [12]:
# Read the age-labelled tweets and discard rows with any missing value.
age_df = pd.read_csv(
    './data/preprocessed_tweets_with_for_age_pred.csv',
    lineterminator='\n',
).dropna()

In [None]:
# Porter-stem every text in sampled_age_df and write the result back.
# NOTE(review): this cell has no execution count and reads `sampled_age_df`,
# which is first assigned in the cell that follows; a top-to-bottom run would
# fail here with NameError. The cell after the sampling step repeats this same
# stemming loop, so running both would stem the text twice — this one looks
# like a leftover; confirm and remove.
stem_list = []
for txt in sampled_age_df['text']:
    wrds = txt.split()
    stem_wrds = []

    # `stemmer` is the PorterStemmer instance created in an earlier cell.
    for i in wrds:
        stem_wrds.append(stemmer.stem(i))

    # Re-join the stems with single spaces.
    str1 = ' '.join(stem_wrds)
    stem_list.append(str1)

sampled_age_df['text'] = stem_list

In [13]:
## sample from age_df
freq = pd.DataFrame({'label':[0, 1],
                     'nostoextract':[36115, 36115], })
sampled_age_df = bootstrap(age_df,freq)

In [14]:
# Porter-stem each sampled age tweet: whitespace-split, stem every token,
# and re-join the stems with single spaces.
all_stem_wrds = [
    ' '.join(stemmer.stem(token) for token in tweet.split())
    for tweet in sampled_age_df['text']
]

sampled_age_df['text'] = all_stem_wrds

In [15]:
sampled_age_df['label'].value_counts()

0    36115
1    36115
Name: label, dtype: int64

In [16]:
X_age = vectorizer.fit_transform(sampled_age_df['text'])
print(X_age.shape)
y_age = sampled_age_df['label']

(72230, 5000)


In [17]:
scores = cross_val_score(clf, X_age, y_age, cv=5, \
               scoring=make_scorer(classification_report_with_accuracy_score))

              precision    recall  f1-score   support

           0       0.56      0.61      0.58      7223
           1       0.57      0.52      0.54      7223

    accuracy                           0.56     14446
   macro avg       0.56      0.56      0.56     14446
weighted avg       0.56      0.56      0.56     14446

              precision    recall  f1-score   support

           0       0.55      0.62      0.58      7223
           1       0.57      0.50      0.53      7223

    accuracy                           0.56     14446
   macro avg       0.56      0.56      0.56     14446
weighted avg       0.56      0.56      0.56     14446

              precision    recall  f1-score   support

           0       0.56      0.61      0.58      7223
           1       0.57      0.51      0.54      7223

    accuracy                           0.56     14446
   macro avg       0.56      0.56      0.56     14446
weighted avg       0.56      0.56      0.56     14446

              preci

### word2vec + Logistic Regression

In [18]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

#### race prediction with word2vec

In [32]:
from gensim.test.utils import common_texts
common_texts[:10]

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [34]:
# Tokenise every stemmed tweet on whitespace: one list of tokens per tweet.
wrd_ls = [s.split() for s in sampled_race_df['text'].to_list()]

wrd_ls[:3]

[['you',
  'play',
  'volleybal',
  'too',
  'you',
  'are',
  'more',
  'than',
  'tripl',
  'threat',
  'your',
  'the',
  'next',
  'level',
  'woman'],
 ['ye', 'and', 'great', 'see', 'real', 'life', 'candyland'],
 ['everybodi',
  'wanna',
  'leav',
  'facebook',
  'for',
  'twitter',
  'nah',
  'keep',
  'all',
  'ass',
  'over',
  'there']]

In [47]:
model = Word2Vec(sentences=wrd_ls, vector_size=500)    
# Store just the words + their trained embeddings.
word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")

In [None]:
# Load back with memory-mapping = read-only, shared across processes.
wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')

print(wv.key_to_index)

In [50]:
wv["<UNK>"] = np.random.rand(500) # 500 is the vectors length

In [52]:
def document_vector(doc, wv=None):
    """Create a document vector by averaging the vectors of its tokens.

    Tokens that are not in the vocabulary are replaced by the "<UNK>" entry
    (they are not dropped), so `wv` must already contain a "<UNK>" key.

    Parameters
    ----------
    doc : list of str, or str
        The tokenised document. A plain string is split on whitespace first,
        because iterating over a string would yield single characters.
    wv : KeyedVectors-like, optional
        Object exposing `key_to_index` and indexing by a list of keys. When
        omitted, the notebook-level `wv` is looked up at call time.

    Returns
    -------
    numpy.ndarray
        Mean of the token vectors along axis 0. A document without tokens
        yields the "<UNK>" vector.

    Raises
    ------
    KeyError
        If `wv` is omitted and no notebook-level `wv` exists.
    """
    if wv is None:
        # Resolve the global on every call. A `wv=wv` default would be frozen
        # when the `def` runs and would keep pointing at the old vectors after
        # `wv` is reassigned in a later cell.
        wv = globals()['wv']

    if isinstance(doc, str):
        doc = doc.split()

    tokens = [word if word in wv.key_to_index else "<UNK>" for word in doc]
    if not tokens:
        # Averaging zero vectors is undefined; fall back to the unknown token.
        tokens = ["<UNK>"]
    return np.mean(wv[tokens], axis=0)

sampled_race_df.loc[:, 'doc_vector']  = [document_vector(s) for s in wrd_ls]

In [53]:
X = sampled_race_df['doc_vector'].to_list()
y = sampled_race_df['label'].to_list()

In [55]:
clf = make_pipeline(preprocessing.StandardScaler(with_mean=False), LogisticRegression(max_iter=2000, solver='newton-cg'))
scores = cross_val_score(clf, X, y, cv=5, \
               scoring=make_scorer(classification_report_with_accuracy_score))

              precision    recall  f1-score   support

         1.0       0.34      0.34      0.34      1800
         2.0       0.30      0.26      0.27      1800
         3.0       0.32      0.33      0.32      1800
         4.0       0.30      0.33      0.31      1800

    accuracy                           0.31      7200
   macro avg       0.31      0.31      0.31      7200
weighted avg       0.31      0.31      0.31      7200

              precision    recall  f1-score   support

         1.0       0.33      0.35      0.34      1800
         2.0       0.32      0.24      0.28      1800
         3.0       0.33      0.35      0.34      1800
         4.0       0.29      0.33      0.31      1800

    accuracy                           0.32      7200
   macro avg       0.32      0.32      0.32      7200
weighted avg       0.32      0.32      0.32      7200

              precision    recall  f1-score   support

         1.0       0.33      0.37      0.35      1800
         2.0       0.



              precision    recall  f1-score   support

         1.0       0.35      0.36      0.35      1800
         2.0       0.32      0.27      0.29      1800
         3.0       0.32      0.31      0.32      1800
         4.0       0.30      0.35      0.33      1800

    accuracy                           0.32      7200
   macro avg       0.32      0.32      0.32      7200
weighted avg       0.32      0.32      0.32      7200

              precision    recall  f1-score   support

         1.0       0.35      0.38      0.36      1800
         2.0       0.31      0.27      0.29      1800
         3.0       0.32      0.33      0.32      1800
         4.0       0.31      0.33      0.32      1800

    accuracy                           0.32      7200
   macro avg       0.32      0.32      0.32      7200
weighted avg       0.32      0.32      0.32      7200





#### Age prediction with word2vec

In [56]:
# Word2Vec expects an iterable of token lists. Passing the raw 'text' column
# would hand it plain strings, which are iterated character by character, so
# the model would learn character embeddings instead of word embeddings.
age_tokens = [tweet.split() for tweet in sampled_age_df['text']]

model = Word2Vec(sentences=age_tokens, vector_size=500)
# Store just the words + their trained embeddings.
word_vectors = model.wv
word_vectors.save("word2vec_age.wordvectors")

# Load back with memory-mapping = read-only, shared across processes.
wv = KeyedVectors.load("word2vec_age.wordvectors", mmap='r')
wv["<UNK>"] = np.random.rand(500) # 500 is the vectors length

# Tokenise each tweet before averaging, and pass the age vectors explicitly
# rather than relying on document_vector's default `wv`, so that these
# embeddings (not an earlier model's) are the ones used. A tweet with no
# tokens falls back to the "<UNK>" vector.
sampled_age_df.loc[:, 'doc_vector'] = sampled_age_df['text'].apply(
    lambda text: document_vector(text.split() or ["<UNK>"], wv=wv)
)

In [25]:
X_age = list(sampled_age_df['doc_vector'])
len(X_age)

1145

In [57]:
scores = cross_val_score(clf, X_age, y_age, cv=5, \
               scoring=make_scorer(classification_report_with_accuracy_score))

              precision    recall  f1-score   support

           0       0.56      0.61      0.58      7223
           1       0.57      0.52      0.54      7223

    accuracy                           0.56     14446
   macro avg       0.56      0.56      0.56     14446
weighted avg       0.56      0.56      0.56     14446

              precision    recall  f1-score   support

           0       0.55      0.62      0.58      7223
           1       0.57      0.50      0.53      7223

    accuracy                           0.56     14446
   macro avg       0.56      0.56      0.56     14446
weighted avg       0.56      0.56      0.56     14446

              precision    recall  f1-score   support

           0       0.56      0.61      0.58      7223
           1       0.57      0.51      0.54      7223

    accuracy                           0.56     14446
   macro avg       0.56      0.56      0.56     14446
weighted avg       0.56      0.56      0.56     14446

              preci