### Social identity classification using Machine Learning

#### Loading required packages

* Pandas
* SKlearn
* Numpy
* NLTK
* BeautifulSoup
* RE (Regular Expressions)

The Anaconda Python distribution, which bundles the most widely used packages, can be downloaded from here: https://www.continuum.io/downloads

Available package list can be seen here: https://docs.continuum.io/anaconda/pkg-docs

In [1]:
import pandas as pd 
import sklearn
import numpy as np
import nltk

import re
from bs4 import BeautifulSoup  

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import tree
from sklearn import svm
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import SelectFromModel

from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

In [2]:
def preProcessing(features):
    """Clean raw text entries and return them as a list of strings.

    Each entry has its markup stripped with BeautifulSoup, is lower-cased,
    split on whitespace and filtered against the NLTK English stop word
    list; the surviving words are re-joined with single spaces.

    Args:
        features: Iterable of raw text entries (pandas Series, NumPy array,
            list, ...). Entries that are None or NaN are treated as empty
            text.

    Returns:
        A list with one cleaned string per input entry, in input order.
    """
    stops = set(stopwords.words('english'))
    clean_titles = []
    # Iterate over the values themselves instead of indexing with
    # range(size): positional access then works for plain lists and for
    # Series whose index is not 0..n-1.
    for raw_text in features:
        # NaN is the only float that differs from itself; missing
        # descriptions must not crash the HTML parser.
        is_nan = isinstance(raw_text, float) and raw_text != raw_text
        if raw_text is None or is_nan:
            raw_text = ""
        text = BeautifulSoup(raw_text, "lxml").get_text()
        # The text is lower-cased once here, so the words need no second pass.
        words = text.lower().split()
        clean_titles.append(" ".join(w for w in words if w not in stops))
    return clean_titles

def getDTMByTFIDF(features,nfeatures):
    """Build a dense TF-IDF document-term matrix from cleaned text.

    Args:
        features: Iterable of text documents.
        nfeatures: Forwarded to TfidfVectorizer as ``max_features``.

    Returns:
        A ``(matrix, vectorizer)`` tuple: the dense TF-IDF array and the
        fitted vectorizer, which is needed to transform new test examples
        before classification.
    """
    vectorizer = TfidfVectorizer(max_features=nfeatures)
    weights = vectorizer.fit_transform(features)
    return weights.toarray(), vectorizer

def featuresByChiSq(features,labels,nFeature=1000):
    """Select the most informative features with a Chi^2 test.

    Args:
        features: 2-D document-term matrix (dense or sparse).
        labels: Class label for every row of ``features``.
        nFeature: Number of features to keep. Values larger than the number
            of available columns are clamped to that number; a string such
            as ``"all"`` is passed through to SelectKBest unchanged.

    Returns:
        A ``(matrix, model)`` tuple: the reduced matrix and the fitted
        SelectKBest model, which is needed to transform new test examples
        before classification.
    """
    k = nFeature
    shape = np.shape(features)
    # SelectKBest rejects k > n_features, so cap the request for small
    # vocabularies instead of failing.
    if not isinstance(nFeature, str) and len(shape) == 2:
        k = min(nFeature, shape[1])
    chi2_model = SelectKBest(chi2, k=k)
    dtm = chi2_model.fit_transform(features, labels)
    return dtm, chi2_model

def featuresByLSA(features,ncomponents=100):
    """Project a document-term matrix onto LSA/LSI components.

    Args:
        features: Document-term matrix to decompose.
        ncomponents: Number of TruncatedSVD components to learn.

    Returns:
        A ``(matrix, model)`` tuple: the normalized LSA components and the
        fitted SVD + Normalizer pipeline.
    """
    lsa = make_pipeline(
        TruncatedSVD(n_components=ncomponents),
        Normalizer(copy=False),
    )
    return lsa.fit_transform(features), lsa

def crossValidate(document_term_matrix,labels,classifier,nfold=2,nFeature=1000):
    """Run stratified cross-validation with per-fold Chi^2 feature selection.

    For every fold the Chi^2 selector is fitted on the training part only,
    the test part is transformed with it, and the classifier is fitted and
    evaluated.

    Relies on the ``StratifiedKFold(labels, n_folds=...)`` interface that
    this file imports from ``sklearn.cross_validation``.

    Args:
        document_term_matrix: Feature matrix supporting row selection with
            an index array.
        labels: Class label for every row (pandas Series, array or list).
        classifier: Estimator exposing ``fit`` and ``predict``.
        nfold: Number of folds.
        nFeature: Number of features kept by the Chi^2 selection in each
            fold (previously hard-coded to 1000).

    Returns:
        ``(precision, recall, fscore)`` rounded to three decimals. Each
        value is the mean of everything collected across folds;
        ``precision_recall_fscore_support`` is called without ``average``,
        so every fold contributes one value per class.
    """
    # Fold indices are positional. A plain array makes labels[train_index]
    # positional too, whatever index a pandas Series carries.
    labels = np.asarray(labels)
    precision = []
    recall = []
    fscore = []
    skf = StratifiedKFold(labels, n_folds=nfold)
    for train_index, test_index in skf:
        X_train, X_test = document_term_matrix[train_index], document_term_matrix[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        # Fit the selector on the training fold only so the test fold does
        # not influence which features are kept.
        X_train_selected, chi2_model = featuresByChiSq(X_train, y_train, nFeature)
        X_test_selected = chi2_model.transform(X_test)
        classifier.fit(X_train_selected, y_train)
        y_pred = classifier.predict(X_test_selected)
        p, r, f, _ = precision_recall_fscore_support(y_test, y_pred)
        precision.append(p)
        recall.append(r)
        fscore.append(f)
    return (round(np.mean(precision),3),round(np.mean(recall),3),round(np.mean(fscore),3))

# Pretty-prints a confusion matrix. Extracted from a GitHub repository (name not recorded).
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    """Print a confusion matrix as an aligned text table.

    The first printed line holds the column labels; every following line
    holds a row label and that row's values formatted with one decimal.
    Cells on a line are separated by a single space.

    Args:
        cm: Matrix indexable as ``cm[i, j]`` (e.g. a NumPy array).
        labels: One label per class, in matrix order.
        hide_zeroes: Blank out cells whose value is 0.
        hide_diagonal: Blank out the diagonal cells.
        hide_threshold: When set to a non-zero value, blank out cells whose
            value is not greater than it.
    """
    columnwidth = max([len(x) for x in labels] + [5])  # 5 is value length
    empty_cell = " " * columnwidth
    label_format = "%{0}s".format(columnwidth)
    value_format = "%{0}.1f".format(columnwidth)

    # Each table row is assembled first and printed with a single call, so
    # the cells stay on one line (the former `print (x,)` calls emitted one
    # cell per line under Python 3).
    header = ["    " + empty_cell]
    header.extend(label_format % label for label in labels)
    print(" ".join(header))

    for i, label1 in enumerate(labels):
        row = ["    " + label_format % label1]
        for j in range(len(labels)):
            cell = value_format % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] > hide_threshold else empty_cell
            row.append(cell)
        print(" ".join(row))

In [3]:
# Load the training set with pandas. The data must be a CSV file: save the
# Excel workbook as CSV first, otherwise this will fail.
training_data = pd.read_csv('Train5_dataset.csv', header=0, encoding='latin-1')
features = training_data['from_user_description']

# One label column per identity class.
(relational_labels,
 occupation_labels,
 political_labels,
 ethnic_labels,
 stigma_labels) = [
    training_data[column]
    for column in ('Relational', 'Occupation', 'Political', 'Ethnic/religion', 'Stigma')
]

In [4]:
# Load the testing set with pandas. The data must be a CSV file: save the
# Excel workbook as CSV first, otherwise this will fail.
testing_data = pd.read_csv('Test5_dataset.csv', header=0, encoding='latin-1')
testing_features = testing_data['from_user_description']

# One label column per identity class, mirroring the training set.
(testing_relational_labels,
 testing_occupation_labels,
 testing_political_labels,
 testing_ethnic_labels,
 testing_stigma_labels) = [
    testing_data[column]
    for column in ('Relational', 'Occupation', 'Political', 'Ethnic/religion', 'Stigma')
]

In [None]:
# Clean the text of both the training and the test set.
processed_features = preProcessing(features)
test_processed_features = preProcessing(testing_features)

# Fit the TF-IDF matrix on the training set only; None is forwarded as the
# vectorizer's max_features.
document_term_matrix, tfidf_vectorizer = getDTMByTFIDF(processed_features, nfeatures=None)

This code is written to quickly run different experiments in order to select the best model for each of the class labels, i.e. Relational, Political, etc.

##### Perform Chi^2 feature selection using different numbers of features and cross-validate the results
The cell below prints the 10-fold cross-validation results.

In [None]:
# Use different features selected via chi^2 feature selection and perform cross validation

def training_id(name, lb, clf):
    """Cross-validate one classifier on one label set and print the scores.

    Runs 10-fold cross-validation on the global ``document_term_matrix`` and
    prints the run name with the feature count, followed by the
    tab-separated precision, recall and F-score.

    Args:
        name: Label printed in front of the results.
        lb: Class labels for the rows of ``document_term_matrix``.
        clf: Classifier handed to ``crossValidate``.
    """
    # crossValidate is called without a feature count, so the value from
    # this tuple only labels the printed output.
    for n in (1000,):
        scores = crossValidate(document_term_matrix, lb, clf, nfold=10)
        print(name, n)
        print("{0}\t{1}\t{2}".format(scores[0], scores[1], scores[2]))


# Classifiers shared by all cross-validation runs below.
lr = LogisticRegression(class_weight="balanced")
bnb = BernoulliNB()
rf = RandomForestClassifier(n_estimators = 100,class_weight="balanced_subsample")
svmc = svm.LinearSVC(C=0.5,class_weight="balanced",max_iter=2000)

# (printed name, training labels, classifier), evaluated in this order.
training_runs = [
    ("relational SVM", relational_labels, svmc),
    ("relational BNB", relational_labels, bnb),
    ("relational LR", relational_labels, lr),
    ("relational RF", relational_labels, rf),
    ("occupation SVM", occupation_labels, svmc),
    ("occupation BNB", occupation_labels, bnb),
    ("occupation LR", occupation_labels, lr),
    ("occupation RF", occupation_labels, rf),
    ("political SVM ", political_labels, svmc),
    ("political BNB ", political_labels, bnb),
    ("political LR  ", political_labels, lr),
    ("political RF  ", political_labels, rf),
    ("ethnic SVM ", ethnic_labels, svmc),
    ("ethnic BNB ", ethnic_labels, bnb),
    ("ethnic LR  ", ethnic_labels, lr),
    ("ethnic RF  ", ethnic_labels, rf),
    ("stigma SVM ", stigma_labels, svmc),
    ("stigma BNB ", stigma_labels, bnb),
    ("stigma LR  ", stigma_labels, lr),
    ("stigma RF  ", stigma_labels, rf),
]

for run_name, run_labels, run_classifier in training_runs:
    training_id(run_name, run_labels, run_classifier)

##### Combined features using tfidf, chi2 or information gain feature selection and latent semantic analysis
Combined features do not significantly improve on the Chi2 performance.

In [None]:
# Experiment on the stigma labels with Chi^2-selected TF-IDF terms combined
# with LSA components.
lb = stigma_labels
svm_l_stigma = svm.LinearSVC(C=0.5,class_weight="balanced",max_iter=2000)

# Branch 1: TF-IDF followed by Chi^2 selection of the 1200 best terms.
description_branch = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('best', SelectKBest(chi2,k=1200)),
])

# Branch 2: TF-IDF followed by a 10-component truncated SVD.
lsa_branch = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('best', TruncatedSVD(n_components=10)),
])

feature_union = FeatureUnion([
    ('description', description_branch),
    ('lsa', lsa_branch),
])
combined_features = feature_union.fit_transform(processed_features,lb)

# Defined for reference; the SVM run below uses svm_l_stigma.
svm_l = svm.LinearSVC(C=0.5,class_weight="balanced",max_iter=1000)

# 10-fold cross-validation of each classifier on the combined features.
for banner, model in (
    ("-----SUPPORT VECTOR MACHINE-----", svm_l_stigma),
    ("-----BERNOULLI NAIVE BAYES-----", bnb),
    ("-----LOGISTIC REGRESSION-----", lr),
    ("-----RANDOM FOREST-----", rf),
):
    print(banner)
    evaluation_results = crossValidate(combined_features,lb,model,nfold=10)
    print("Precision: {0}\nRecall: {1}\nF-score: {2}\n".format(*evaluation_results))

The code below performs feature selection using Chi^2 to learn 1000 highly informative features, transforms the test set, 
trains the model on the training set and calculates precision, recall and F-score.

### Identity Classes Training and Testing

In [None]:
def test_id(name, lb, clf, test_lb, show_cm=False):
    """Train a classifier on the training set and score it on the test set.

    Selects 1000 features from the global ``document_term_matrix`` with
    Chi^2, applies the fitted TF-IDF vectorizer and the Chi^2 selector to
    the global ``test_processed_features``, fits ``clf`` and prints the
    precision, recall and F-score of its predictions.

    Args:
        name: Label printed in front of the results.
        lb: Training labels for the rows of ``document_term_matrix``.
        clf: Classifier exposing ``fit`` and ``predict``.
        test_lb: True labels of the test set.
        show_cm: When True, also print the confusion matrix with the class
            labels ordered as [1, 0].
    """
    selected_dtm, chi2_model = featuresByChiSq(document_term_matrix, lb, 1000)
    test_dtm = chi2_model.transform(tfidf_vectorizer.transform(test_processed_features))

    clf.fit(selected_dtm, lb)
    prediction = clf.predict(test_dtm)

    p = precision_score(test_lb, prediction)
    r = recall_score(test_lb, prediction)
    # With precision and recall both 0 the harmonic mean is undefined;
    # report 0.0 instead of dividing by zero.
    f = 2*p*r / (p+r) if (p + r) > 0 else 0.0

    print (name)
    print ("{0}\t {1}\t {2}".format(p,r,f))
    if show_cm:
        cm = confusion_matrix(test_lb, prediction, labels=[1,0])
        print ("Confusion Matrix\n")
        print_cm(cm, ["1", "0"])

# Same classifier configuration as in the cross-validation cell above,
# repeated here so this cell can be run on its own.
lr = LogisticRegression(class_weight="balanced")
bnb = BernoulliNB()
rf = RandomForestClassifier(n_estimators = 100,class_weight="balanced_subsample")
svmc = svm.LinearSVC(C=0.5,class_weight="balanced",max_iter=2000)

# (printed name, training labels, classifier, test labels), evaluated in this order.
test_runs = [
    ("Relational SVM", relational_labels, svmc, testing_relational_labels),
    ("Relational BNB", relational_labels, bnb, testing_relational_labels),
    ("Relational LR", relational_labels, lr, testing_relational_labels),
    ("Relational RF", relational_labels, rf, testing_relational_labels),
    ("Occupation SVM", occupation_labels, svmc, testing_occupation_labels),
    ("Occupation BNB", occupation_labels, bnb, testing_occupation_labels),
    ("Occupation LR", occupation_labels, lr, testing_occupation_labels),
    ("Occupation RF", occupation_labels, rf, testing_occupation_labels),
    ("Political SVM", political_labels, svmc, testing_political_labels),
    ("Political BNB", political_labels, bnb, testing_political_labels),
    ("Political LR", political_labels, lr, testing_political_labels),
    ("Political RF ", political_labels, rf, testing_political_labels),
    ("Ethnic SVM", ethnic_labels, svmc, testing_ethnic_labels),
    ("Ethnic BNB", ethnic_labels, bnb, testing_ethnic_labels),
    ("Ethnic LR", ethnic_labels, lr, testing_ethnic_labels),
    ("Ethnic RF ", ethnic_labels, rf, testing_ethnic_labels),
    ("Stigma SVM", stigma_labels, svmc, testing_stigma_labels),
    ("Stigma BNB", stigma_labels, bnb, testing_stigma_labels),
    ("Stigma LR", stigma_labels, lr, testing_stigma_labels),
    ("Stigma RF ", stigma_labels, rf, testing_stigma_labels),
]

for run_name, train_labels, run_classifier, held_out_labels in test_runs:
    test_id(run_name, train_labels, run_classifier, held_out_labels)


--------------------------------------------------------------------------------------------------------------------------------