In [2]:
from __future__ import division
from itertools import chain

import pandas as pd
import xml.etree.ElementTree as ET
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from scipy.sparse import dok_matrix
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB,BernoulliNB
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
#from sklearn import cross_validation
from sklearn.model_selection import cross_val_score, train_test_split

<hr style="border:2px solid gray"></hr>

## Functions and Code Compiled
Functions and combined code for the text processing and model creation

In [3]:
def readDataToDataFrame(df):
    i = 0
    new_df = pd.DataFrame(columns = ["id", "tweet", "gender"])
    for k, v in df.items():
        filepath = "data/"+k+".xml"
        tree = ET.parse(filepath)
        docs = tree.findall('./documents/document')
        tweet_text = ' '.join([doc.text for doc in docs])
        new_df.loc[i] = [k, tweet_text, v]
        i += 1
        
    gender_encode = {"gender":{"male":1, "female":0}}
    new_df.replace(gender_encode, inplace = True)
    return new_df

def createTrainTestData(data, testdata, train_split = False):
    if not train_split:
        return data, testdata
    np.random.seed(3)
    msk = np.random.rand(len(data)) < 0.8
    train = data[msk].copy()
    test = data[~msk].copy()
    return train, test

stopwords_list_570 = []
with open('./stopwords_en.txt') as f:
    stopwords_list_570 = f.read().splitlines()

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl=WordNetLemmatizer()
    def __call__(self,doc):
        tokenizer = RegexpTokenizer(r"\w+") 
        tokens = tokenizer.tokenize(doc)
        tokenizer = RegexpTokenizer(r"\d+")
        number_tokens = tokenizer.tokenize(doc)
        
        tokens = [w for w in tokens if w not in number_tokens]
        rmstopwords = [x for x in tokens if x not in stopwords_list_570]
        return [self.wnl.lemmatize(t) for t in rmstopwords]

vectorizer=TfidfVectorizer(analyzer='word',input='content',
                           lowercase=True,
#                            token_pattern='\w+',
                           min_df=0,
                           ngram_range=(1,1),
                           tokenizer=LemmaTokenizer())

# def getDocsLabelsList(train, test, flag = False):
#     trainDocs = train.Tweet.tolist()
#     trainLabels = train.Gender.tolist()
#     testDocs = test.Tweet.tolist()
#     if flag:
#         testLabels = test.Gender.tolist()
#     return trainDocs, trainLabels, testDocs, testLables

def showModelStats(models, x_train, y_train, x_test, y_test):
#     models = [
#     LogisticRegression(),
#     BernoulliNB(),
#     LinearSVC(),
#     RandomForestClassifier()
#     ]
    for clf in models:
        model_name = clf.__class__.__name__
        clf.fit(x_train, y_train)
        print(model_name)
        # Do the prediction
        y_predict=clf.predict(x_test)
        print(confusion_matrix(y_test,y_predict))
        recall=recall_score(y_test,y_predict,average='macro')
        precision=precision_score(y_test,y_predict,average='macro')
        f1score=f1_score(y_test,y_predict,average='macro')
        accuracy=accuracy_score(y_test,y_predict)
        matthews = matthews_corrcoef(y_test,y_predict) 
        print('Accuracy: '+ str(accuracy))
        print('Macro Precision: '+ str(precision))
        print('Macro Recall: '+ str(recall))
        print('Macro F1 score:'+ str(f1score))
        print('MCC:'+ str(matthews))

def predictValues(models, x_train, y_train, x_test):
#     models = [
#         LinearSVC(),
#     ]
    pred_values_dict = {}
    for clf in models:
        model_name = clf.__class__.__name__
        clf.fit(x_train, y_train)
        y_predict=clf.predict(x_test)
        pred_values_dict[clf.__class__.__name__] = y_predict
#     return y_predict
    return pred_values_dict


In [25]:
train_data = pd.read_csv("train_labels.csv", index_col = 0, squeeze = True).to_dict()
test_data = pd.read_csv("test.csv", index_col = 0, usecols=["id","gender"], squeeze = True).to_dict()

train_tweets = readDataToDataFrame(train_data)
test_tweets = readDataToDataFrame(test_data)

# Set train_split flag to split the training data 80:20
train, test = createTrainTestData(train_tweets, test_tweets, train_split = True) 

trainDocs = train.tweet.tolist()
trainLabels = train.gender.tolist()
testDocs = test.tweet.tolist()
testLabels = test.gender.tolist()


In [16]:
len(trainDocs)

3100

In [26]:
vectorizer=CountVectorizer(analyzer='word',input='content',
                           lowercase=True,
#                            token_pattern='\w+',
                           min_df=0,
                           ngram_range=(1,1),
                           tokenizer=LemmaTokenizer())

In [27]:
# Fit and transform training and test docs
x_train=vectorizer.fit_transform(trainDocs)
y_train=np.asarray(trainLabels)
x_test=vectorizer.transform(testDocs)
y_test=np.asarray(testLabels)

In [28]:
models = [
    LogisticRegression(),
    BernoulliNB(),
    LinearSVC(),
    RandomForestClassifier()
    ]

showModelStats(models, x_train, y_train, x_test, y_test)
#pred_dict = predictValues(models, x_train, y_train, x_test)




LogisticRegression
[[259  70]
 [ 49 227]]
Accuracy: 0.8033057851239669
Macro Precision: 0.8026094276094276
Macro Recall: 0.8048489053345668
Macro F1 score:0.8027539650576301
MCC:0.6074542048604283
BernoulliNB
[[232  97]
 [ 62 214]]
Accuracy: 0.7371900826446282
Macro Precision: 0.7386092700745893
Macro Recall: 0.7402647460464297
Macro F1 score:0.7369572410247717
MCC:0.47887115460756313
LinearSVC
[[261  68]
 [ 50 226]]
Accuracy: 0.8049586776859504
Macro Precision: 0.8039678894065665
Macro Recall: 0.8060768248094797
Macro F1 score:0.8043037280701755
MCC:0.6100410688915984




RandomForestClassifier
[[229 100]
 [ 98 178]]
Accuracy: 0.6727272727272727
Macro Precision: 0.6702967900908631
Macro Recall: 0.6704880842253644
Macro F1 score:0.6703850048428281
MCC:0.34078482062627313


In [22]:
# Select a list of predicted values from the dictionary by the model name
y_pred = pred_dict['LinearSVC']

In [24]:
# Write predicted labels to csv file
final_csv = test[['id', 'gender']]
# final_csv.rename(columns = {'ID':'id', 'Gender':'gender'}, inplace = True)
final_csv.gender = y_pred
final_csv.to_csv('pred_labels.csv', index = False)

<hr style="border:2px solid gray"> </hr>

In [4]:
train_data = pd.read_csv("train_labels.csv", index_col = 0, squeeze = True).to_dict()
some = pd.read_csv("test.csv", index_col = 0, usecols=["id","gender"], squeeze = True).to_dict()

In [None]:
test_data.head()

In [8]:
# Read test xml files
i=0
tweets_test = pd.DataFrame(columns = ["ID", "Tweet", "Gender"])
for k, v in test_data.items():
    filepath = "data/"+k+".xml"
#     print(filepath)
    tree = ET.parse(filepath)
    docs = tree.findall('./documents/document')
#     docs = tree.find('./documents/document').text
    tweet_text = ' '.join([doc.text for doc in docs])
    tweets_test.loc[i] = [k, tweet_text, v]
    i+=1
    
print(tweets_test.head())

NameError: name 'test_data' is not defined

In [None]:
tweets_test.shape

In [7]:
train_data.keys()

dict_keys(['d7d392835f50664fc079f0f388e147a0', 'ee40b86368137b86f51806c9f105b34b', '919bc742d9a22d65eab1f52b11656cab', '15b97a08d65f22d97ca685686510b6ae', 'affa98421ef5c46ca7c8f246e0a134c1', '7ebfa9227af3b76ea693c007db7eb83a', 'd8cdf701a99c9a4bbb85bb2169a4890e', '9147f96117e270f9aa92478ca1f37ccf', 'f6c5c9b5e50fab18461bb883a762d925', 'f8a4b86bcaa2b4dfaed29c14a27ce0b7', '6d29ca3ba724d43676ed46abb929dea0', '7f9f7c60389c2e63c21d612c8f17e2bf', 'bcb1a90aab31898982764f7396b4525d', 'f1375d89d477ca2512ea30843fd67e1a', '2e1f7aa4ea8de17044b17df3a50ef479', 'd560cafc83e95a7ceab43538ad31b66f', '675fefe06e2c562e56530739814483b7', '77d858eb2350274b754a338c5b76d7d9', 'b5cdf021d9b1c7d5da1ef12fc09ee405', 'ce6da8912b039169e6c25b426720defb', 'fe4aa7954b3d5798dcf3055205789056', '7b93cf81663e677fd70d4fd264f85997', '3478b8af3817646730f78710fd7cc84e', '2921fd9f757a647f92bb05caaf994abe', '5d0c6c359ac25f5a9a4a481a905c3104', '489b1c7118d2a3629ada4809eccedd65', '859a3403b4758039ae0b136b96d8149', 'bc69be41fceff0b57

In [9]:
# Read train xml files
i=0
tweets = pd.DataFrame(columns = ["ID", "Tweet", "Gender"])
for k, v in train_data.items():
    filepath = "data/"+k+".xml"
#     print(filepath)
    tree = ET.parse(filepath)
    docs = tree.findall('./documents/document')
#     docs = tree.find('./documents/document').text
    tweet_text = ' '.join([doc.text for doc in docs])
    tweets.loc[i] = [k, tweet_text, v]
    i+=1
    
print(tweets.head())

                                 ID  \
0  d7d392835f50664fc079f0f388e147a0   
1  ee40b86368137b86f51806c9f105b34b   
2  919bc742d9a22d65eab1f52b11656cab   
3  15b97a08d65f22d97ca685686510b6ae   
4  affa98421ef5c46ca7c8f246e0a134c1   

                                               Tweet  Gender  
0  @CSIFERROSCAN youch! Good things to know! Is t...    male  
1  Donald the Menace #ThanksComey  https://t.co/j...  female  
2  This seems super sketch / too good to be true:...    male  
3  Just some texts with my dad about our Saturday...  female  
4  Irrevocably love this talented human and so pr...  female  


In [None]:
Gender_encode = {"Gender":{"male":1, "female":0}}
tweets.replace(Gender_encode, inplace = True)

In [None]:
tweets.head()

In [None]:
np.random.seed(3)
msk = np.random.rand(len(tweets)) < 0.8
train = tweets[msk].copy()
test = tweets[~msk].copy()

In [None]:
train = tweets
test = tweets_test

In [None]:
train.shape
test.shape

In [None]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl=WordNetLemmatizer()
    def __call__(self,doc):
#         print(type(doc))
        tokenizer = RegexpTokenizer(r"\w+") 
        tokens = tokenizer.tokenize(doc)
        tokenizer = RegexpTokenizer(r"\d+")
        number_tokens = tokenizer.tokenize(doc)
        
        tokens = [w for w in tokens if w not in number_tokens]
#         print(len(word_tokenize(doc)))
        rmstopwords = [x for x in tokens if x not in stopwords_list_570]
#         print(len(rmstopwords))
#         return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
        return [self.wnl.lemmatize(t) for t in rmstopwords]

In [None]:
vectorizer=TfidfVectorizer(analyzer='word',input='content',
                           lowercase=True,
#                            token_pattern='(?u)\\b\\w\\w+\\b',
#                            token_pattern='\w+',
#                            token_pattern=tokenizer,
                           min_df=0,
#                            ngram_range=(1,2),
                           ngram_range=(1,1),
                           tokenizer=LemmaTokenizer())

In [None]:
x_train=vectorizer.fit_transform(trainDocs)
y_train=np.asarray(trainLabels)

In [None]:
x_test=vectorizer.transform(testDocs)
# y_test=np.asarray(testLables)

In [None]:
print(len(vectorizer.get_feature_names()))
# print(vectorizer.get_feature_names())

In [None]:
# x_train[0, :]
print(x_train.todense())

In [None]:
trainDocs = train.Tweet.tolist()
trainLabels = train.Gender.tolist()

In [None]:
testDocs = test.Tweet.tolist()
# testLables = test.Gender.tolist()

In [None]:
print(trainDocs)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
trainDocs = tweets.Tweet.tolist()
trainLabels = tweets.Gender.tolist()

In [None]:
raw = trainDocs[0]

In [None]:
tokenizer = RegexpTokenizer(r"\w+") 
tokens = tokenizer.tokenize(raw)

In [None]:
len(tokens)

In [None]:
tokenizer = RegexpTokenizer(r"\s+", gaps = True) 
tokens = tokenizer.tokenize(raw)

In [None]:
tokens[:50]

In [None]:
print('VC93gMCzK5' in tokens)

In [None]:
stopwords_list_570 = []
with open('./stopwords_en.txt') as f:
    stopwords_list_570 = f.read().splitlines()

In [None]:
filtered_tokens = [w for w in tokens if w.lower() not in stopwords_list_570]
filtered_tokens

In [None]:
len(filtered_tokens)

In [None]:
tokenizer = RegexpTokenizer(r"\d+")
number_tokens = tokenizer.tokenize(raw)

In [None]:
len(number_tokens)

In [None]:
tokens[:30]

In [None]:
[x for x in data.items()][:5]

In [None]:
filtered_tokens_2 = [w for w in filtered_tokens if w not in number_tokens]
len(filtered_tokens_2)

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(['{0} -> {1}'.format(w, lemmatizer.lemmatize(w)) for w in filtered_tokens_2])
lemmatized_tokens = [lemmatizer.lemmatize(w) for w in filtered_tokens_2]

In [None]:
from nltk.probability import *
fd_1 = FreqDist(lemmatized_tokens)

In [None]:
len(lemmatized_tokens)
lemmatized_tokens[:5]

In [None]:
fd_1.most_common(25)

In [None]:
fd_1.plot(25, cumulative=False)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(analyzer = "word")
tfs = tfidf.fit_transform(lemmatized_tokens)
tfs.shape

In [None]:
vocab = tfidf.get_feature_names()
# for word, weight in zip(vocab, tfs.toarray()[0]):
#     if weight > 0:
#         print (word, ":", weight)
# print(len(vocab))
# print(vocab)

In [None]:
# Training: We use 3 different models and 5 fold cross validation to see which one is better
models = [
    LogisticRegression(),
    BernoulliNB(),
    LinearSVC(),
    RandomForestClassifier()
]
CV = 10
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
     model_name = model.__class__.__name__
     accuracies = cross_val_score(model, x_train, y_train, scoring='accuracy', cv=CV)
     for fold_idx, accuracy in enumerate(accuracies):
          entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
#cv_df
import seaborn as sns
import matplotlib.pyplot as plt
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, 
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()


In [67]:
from scipy.sparse import dok_matrix
import numpy as np
import scipy as sp
from scipy.sparse import csr_matrix, issparse

In [76]:
X = np.array(x_test)
X_train = sp.sparse.hstack([x_train]).toarray()


In [75]:
lda = LinearDiscriminantAnalysis()
model = lda.fit(X_train, y_train)

KeyboardInterrupt: 

In [54]:
models = [
    LogisticRegression(),
    BernoulliNB(),
    LinearSVC(),
    RandomForestClassifier(),
    KNeighborsClassifier(n_neighbors=1),
    LinearDiscriminantAnalysis()
]

for clf in models:
    model_name = clf.__class__.__name__
    clf.fit(x_train, y_train)
    print(model_name)
    # Do the prediction
    y_predict=clf.predict(x_test)
    print(confusion_matrix(y_test,y_predict))
    recall=recall_score(y_test,y_predict,average='macro')
    precision=precision_score(y_test,y_predict,average='macro')
    f1score=f1_score(y_test,y_predict,average='macro')
    accuracy=accuracy_score(y_test,y_predict)
    matthews = matthews_corrcoef(y_test,y_predict) 
    print('Accuracy: '+ str(accuracy))
    print('Macro Precision: '+ str(precision))
    print('Macro Recall: '+ str(recall))
    print('Macro F1 score:'+ str(f1score))
    print('MCC:'+ str(matthews))




LogisticRegression
[[228 101]
 [ 51 225]]
Accuracy: 0.7487603305785124
Macro Precision: 0.7536941750775117
Macro Recall: 0.7541132549226905
Macro F1 score:0.7487541528239202
MCC:0.5078072570724977
BernoulliNB
[[232  97]
 [ 62 214]]
Accuracy: 0.7371900826446282
Macro Precision: 0.7386092700745893
Macro Recall: 0.7402647460464297
Macro F1 score:0.7369572410247717
MCC:0.47887115460756313
LinearSVC
[[246  83]
 [ 49 227]]
Accuracy: 0.7818181818181819
Macro Precision: 0.7830781848004373
Macro Recall: 0.7850920664287917
Macro F1 score:0.7816027828826464
MCC:0.5681666821124521




RandomForestClassifier
[[233  96]
 [100 176]]
Accuracy: 0.6760330578512397
Macro Precision: 0.6733792616145557
Macro Recall: 0.6729439231751905
Macro F1 score:0.6731316294352436
MCC:0.3463229111730206
KNeighborsClassifier
[[151 178]
 [ 84 192]]
Accuracy: 0.5669421487603306
Macro Precision: 0.5807360552041403
Macro Recall: 0.5773093696312938
Macro F1 score:0.5649441187449223
MCC:0.15800827237014425


TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [None]:
models = [
    QuadraticDiscriminantAnalysis(),
    LinearDiscriminantAnalysis()
]

for clf in models:
    model_name = clf.__class__.__name__
    clf.fit(X_train, y_train)
    print(model_name)
    # Do the prediction
    y_predict=clf.predict(X)
    print(confusion_matrix(y_test,y_predict))
    recall=recall_score(y_test,y_predict,average='macro')
    precision=precision_score(y_test,y_predict,average='macro')
    f1score=f1_score(y_test,y_predict,average='macro')
    accuracy=accuracy_score(y_test,y_predict)
    matthews = matthews_corrcoef(y_test,y_predict) 
    print('Accuracy: '+ str(accuracy))
    print('Macro Precision: '+ str(precision))
    print('Macro Recall: '+ str(recall))
    print('Macro F1 score:'+ str(f1score))
    print('MCC:'+ str(matthews))



In [None]:
models = [
    LinearSVC(),
]

for clf in models:
    model_name = clf.__class__.__name__
    clf.fit(x_train, y_train)
    y_predict=clf.predict(x_test)

y_predict

In [None]:
# tweets_test.head()
# type(y_predict)
final_csv = tweets_test[['ID','Gender']]
final_csv.rename(columns = {'ID':'id', 'Gender':'gender'}, inplace = True)
final_csv.gender = y_predict
final_csv.head()


In [None]:
final_csv.to_csv('pred_labels.csv', index = False)

In [None]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
# print(y_test.shape)