In [None]:
from __future__ import division
from itertools import chain

import pandas as pd
import xml.etree.ElementTree as ET
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from scipy.sparse import dok_matrix
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB,BernoulliNB
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
#from sklearn import cross_validation
from sklearn.model_selection import cross_val_score, train_test_split

<hr style="border:2px solid gray">

## Functions and Code Compiled
Functions and combined code for the text processing and model creation

In [None]:
def readDataToDataFrame(df):
    """Build a tweet DataFrame from a mapping of author id to gender label.

    For every key ``k`` of ``df`` the file ``data/<k>.xml`` is parsed and the
    text of all ``./documents/document`` elements is joined with single spaces.

    Parameters
    ----------
    df : mapping
        Anything exposing ``.items()`` that yields ``(author_id, gender)``
        pairs. Despite the parameter name, the callers in this notebook pass
        a plain ``dict``.

    Returns
    -------
    pandas.DataFrame
        One row per key, in iteration order, with columns ``id``, ``tweet``
        and ``gender``. ``"male"`` is encoded as 1 and ``"female"`` as 0; any
        other label is passed through unchanged.

    Raises
    ------
    OSError
        If an author's XML file cannot be opened.
    xml.etree.ElementTree.ParseError
        If an author's XML file is not well-formed.
    """
    gender_encode = {"male": 1, "female": 0}

    # Collect plain records and build the frame once; appending with
    # ``.loc[i]`` re-allocates the frame on every row.
    records = []
    for k, v in df.items():
        filepath = "data/" + k + ".xml"
        docs = ET.parse(filepath).findall('./documents/document')
        # An empty <document/> element has ``text is None``, which would make
        # str.join raise TypeError, so such elements are skipped.
        tweet_text = ' '.join(doc.text for doc in docs if doc.text is not None)
        records.append((k, tweet_text, gender_encode.get(v, v)))

    return pd.DataFrame(records, columns=["id", "tweet", "gender"])

def createTrainTestData(data, testdata, train_split = False):
    """Return the (train, test) pair to use for modelling.

    With ``train_split`` false the two inputs are handed back untouched.
    Otherwise ``testdata`` is ignored and ``data`` is partitioned by a random
    boolean mask: rows whose uniform draw is below 0.8 go to the training
    part, the remaining rows to the test part. Both parts are copies.

    Note: the split seeds NumPy's *global* random generator with 3, so the
    global random state is modified as a side effect.
    """
    if train_split:
        np.random.seed(3)
        in_train = np.random.rand(len(data)) < 0.8
        return data[in_train].copy(), data[~in_train].copy()
    return data, testdata

# Stop words read from a local text file, one entry per line.
# The name is bound to an empty list first so it exists even if reading fails.
stopwords_list_570 = list()
with open('./stopwords_en.txt') as stopword_file:
    stopwords_list_570 = stopword_file.read().splitlines()

class LemmaTokenizer(object):
    """Callable tokenizer that yields lemmatized word tokens.

    A document is split into ``\\w+`` tokens; tokens that equal one of the
    document's ``\\d+`` matches (i.e. purely numeric tokens) and tokens found
    in ``stopwords_list_570`` are dropped; the survivors are lemmatized with
    ``WordNetLemmatizer``. Instances are passed as ``tokenizer=`` to the
    vectorizers in this notebook.
    """
    def __init__(self):
        self.wnl=WordNetLemmatizer()
        # Built once per instance instead of once per document.
        self._word_tokenizer = RegexpTokenizer(r"\w+")
        self._number_tokenizer = RegexpTokenizer(r"\d+")

    def __call__(self,doc):
        """Return the list of lemmatized, filtered tokens of the string ``doc``."""
        tokens = self._word_tokenizer.tokenize(doc)
        # Sets make each membership test O(1); with lists the filtering cost
        # grows with (number of tokens) x (number of digit runs / stop words).
        number_tokens = set(self._number_tokenizer.tokenize(doc))
        # The module-level list is read on every call so that later changes
        # to it are still honoured.
        stopwords = set(stopwords_list_570)
        return [self.wnl.lemmatize(t) for t in tokens
                if t not in number_tokens and t not in stopwords]

# TF-IDF vectorizer over lowercased unigrams, tokenized by LemmaTokenizer.
# NOTE: a later cell rebinds `vectorizer` to a CountVectorizer, so this
# instance is only used if that cell is skipped.
vectorizer=TfidfVectorizer(analyzer='word',input='content',
                           lowercase=True,
                           # min_df=1 keeps every term, exactly like the former
                           # min_df=0, but is also accepted by scikit-learn
                           # releases that validate an integer min_df as >= 1.
                           min_df=1,
                           ngram_range=(1,1),
                           tokenizer=LemmaTokenizer())

# def getDocsLabelsList(train, test, flag = False):
#     trainDocs = train.Tweet.tolist()
#     trainLabels = train.Gender.tolist()
#     testDocs = test.Tweet.tolist()
#     if flag:
#         testLabels = test.Gender.tolist()
#     return trainDocs, trainLabels, testDocs, testLables

def showModelStats(models, x_train, y_train, x_test, y_test):
    """Fit every classifier in ``models`` and print its scores on the test data.

    For each estimator the class name, the confusion matrix, accuracy,
    macro-averaged precision / recall / F1 and the Matthews correlation
    coefficient are printed. Estimators are fitted in place; nothing is
    returned.
    """
    for estimator in models:
        estimator_name = estimator.__class__.__name__
        estimator.fit(x_train, y_train)
        print(estimator_name)

        predicted = estimator.predict(x_test)
        print(confusion_matrix(y_test, predicted))

        # All scores are computed before any of them is printed.
        macro_recall = recall_score(y_test, predicted, average='macro')
        macro_precision = precision_score(y_test, predicted, average='macro')
        macro_f1 = f1_score(y_test, predicted, average='macro')
        acc = accuracy_score(y_test, predicted)
        mcc = matthews_corrcoef(y_test, predicted)

        report_lines = (
            ('Accuracy: ', acc),
            ('Macro Precision: ', macro_precision),
            ('Macro Recall: ', macro_recall),
            ('Macro F1 score:', macro_f1),
            ('MCC:', mcc),
        )
        for label, value in report_lines:
            print(label + str(value))

def predictValues(models, x_train, y_train, x_test):
    """Fit each model and collect its predictions for ``x_test``.

    Returns a dict mapping the estimator's class name to whatever its
    ``predict`` returned. Because the key is the class name, two estimators
    of the same class share one entry and the later one wins.
    """
    predictions = {}
    for estimator in models:
        key = estimator.__class__.__name__
        estimator.fit(x_train, y_train)
        predictions[key] = estimator.predict(x_test)
    return predictions


In [None]:
# Label files are loaded as {first CSV column -> remaining column} dicts;
# readDataToDataFrame treats each key as an author id and each value as gender.
# DataFrame.squeeze("columns") replaces the read_csv(squeeze=True) keyword,
# which newer pandas releases no longer accept.
train_data = pd.read_csv("train_labels.csv", index_col = 0).squeeze("columns").to_dict()
test_data = pd.read_csv("test.csv", index_col = 0, usecols=["id","gender"]).squeeze("columns").to_dict()

train_tweets = readDataToDataFrame(train_data)
test_tweets = readDataToDataFrame(test_data)

# Set train_split flag to split the training data 80:20.
# With train_split = True, test_tweets is ignored and `test` is the hold-out
# part of train_tweets.
train, test = createTrainTestData(train_tweets, test_tweets, train_split = True)

trainDocs = train.tweet.tolist()
trainLabels = train.gender.tolist()
testDocs = test.tweet.tolist()
testLabels = test.gender.tolist()


In [None]:
# Sanity check: number of training documents (one per row of `train`)
len(trainDocs)

In [None]:
# Raw term-count vectorizer over lowercased unigrams, tokenized by
# LemmaTokenizer. This rebinds `vectorizer`, replacing the TF-IDF instance
# created earlier in the notebook.
vectorizer=CountVectorizer(analyzer='word',input='content',
                           lowercase=True,
                           # min_df=1 keeps every term, exactly like the former
                           # min_df=0, but is also accepted by scikit-learn
                           # releases that validate an integer min_df as >= 1.
                           min_df=1,
                           ngram_range=(1,1),
                           tokenizer=LemmaTokenizer())

In [None]:
# Download the NLTK 'wordnet' resource (presumably the data WordNetLemmatizer,
# used inside LemmaTokenizer, looks up — confirm for the installed NLTK version).
import nltk
nltk.download('wordnet')

In [None]:
# Learn the vocabulary from the training documents only, then apply the
# fitted vectorizer to the test documents.
x_train = vectorizer.fit_transform(trainDocs)
x_test = vectorizer.transform(testDocs)

# Labels as NumPy arrays
y_train = np.asarray(trainLabels)
y_test = np.asarray(testLabels)

In [None]:
# Classifiers to compare, all with their default hyper-parameters.
models = [
    LogisticRegression(),
    BernoulliNB(),
    LinearSVC(),
    RandomForestClassifier()
    ]

# showModelStats fits each model and prints its scores; predictValues then
# fits the same instances a second time and returns {class name: predictions}.
# NOTE(review): no random_state is passed to the estimators, so the refitted
# models may not reproduce exactly the predictions that were scored — confirm.
showModelStats(models, x_train, y_train, x_test, y_test)
pred_dict = predictValues(models, x_train, y_train, x_test)


In [None]:
# Pick one model's predictions from the dictionary; keys are the estimator class names
y_pred = pred_dict['LinearSVC']

In [None]:
# Write predicted labels to csv file.
# `test` is the frame returned by createTrainTestData, so when that call used
# train_split = True these are predictions for the hold-out split.
# assign() builds a new frame, which avoids setting a column on a slice of
# `test` (SettingWithCopyWarning) and leaves `test` itself untouched.
final_csv = test[['id']].assign(gender = y_pred)
final_csv.to_csv('pred_labels.csv', index = False)

<hr style="border:2px solid gray">