In [118]:
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.utils import compute_class_weight
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [119]:
df = pd.read_csv('Training_Dataset/2500_training_data_2.csv')
X,y = df['tweet'],df['label']
print X.shape, y.shape

(2513,) (2513,)


In [120]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 45)
# print X_train.shape
# print X_train

# Feature Extraction (TF-IDF) unigrams and bigrams

In [121]:
#   - Filters out terms that occur in only one document (min_df=2).
vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1,2),min_df=2)
X = vectorizer.fit_transform(X_train) 
train_tfidf_feature = X.toarray()
# print train_tfidf_feature
# print train_tfidf_feature.shape

# Feature Extraction (LSA)
Latent Semantic Analysis is a technique that analyzes relationship between a set of documents and of the terms. This method will extract contextual-usage meaning of words by statistical computations applied to a large corpus of text. 

Input: X[m][n] -> a matrix where m is the number of documents and n is the number of terms. 

The matrix X will be decomposed into three matrices called the U, S, and T. In doing the decomposition, a value of <i>k</i> will have to be picked since it will represent the number of concepts kept. 

<center>$X\approx USV^{T} $</center>

U[m][k] where the rows are the documents and the columns will be the mathematical concepts.

S[k][k] is a diagonal matrix where elements will be the amount of variation captured from each concept. 

V[m][k] transpose where the rows will be terms and the columns will be concepts.

In [122]:
lsa = TruncatedSVD(n_components = 500, n_iter=500)
train_lsa_feature = lsa.fit_transform(X)
print train_lsa_feature.shape

(2010, 500)


In [123]:
# #### To view list of related concepts
# terms = vectorizer.get_feature_names()
# for i, comp in enumerate(train_lsa_feature):
#     termsInComp = zip (terms, comp)
#     sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse = True) [:10]
#     print "Concept %d:" % i
#     for term in sortedTerms:
#         print term[0]
#     print " "

# SVM Classifier

# Concatenating features

In [124]:
train_final_representation = np.concatenate((train_tfidf_feature,train_lsa_feature),axis=1)

In [125]:
class_weights = compute_class_weight('balanced', [0, 1], y)
class_weight_dictionary = {1:class_weights[0], 1:class_weights[1]}
clf3 = SGDClassifier(class_weight=class_weight_dictionary,loss="hinge",
                    penalty="l2", shuffle=True,)
# clf3 = SGDClassifier(class_weight=class_weight_dictionary)
clf3.partial_fit(train_final_representation,y_train,classes=[0,1])

SGDClassifier(alpha=0.0001, average=False,
       class_weight={1: 1.3945615982241952}, epsilon=0.1, eta0=0.0,
       fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
       loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,
       random_state=None, shuffle=True, verbose=0, warm_start=False)

# Testing Classifier

In [126]:
X2 = vectorizer.transform(X_test)
test_tfidf_feature = X2.toarray()
test_lsa_feature = lsa.transform(X2)

In [127]:
test_final_representation = np.concatenate((test_tfidf_feature,test_lsa_feature),axis=1)
accuracy3 = clf3.score(test_final_representation,y_test) *100
prediction3 = clf3.predict(test_final_representation)
print "Accuracy: ",accuracy3
# print "Prediction: ",prediction3

Accuracy:  73.1610337972


In [128]:
print test_final_representation

[[ 0.          0.          0.         ...,  0.03909189  0.01221405
  -0.0065787 ]
 [ 0.          0.          0.         ...,  0.00477606 -0.00495635
  -0.00140132]
 [ 0.          0.          0.         ...,  0.02636496 -0.06113918
   0.00441372]
 ..., 
 [ 0.          0.          0.         ...,  0.04607175 -0.02371487
  -0.01091672]
 [ 0.          0.          0.         ...,  0.00424238  0.03222609
  -0.04062164]
 [ 0.          0.          0.         ..., -0.01159308 -0.0289939
   0.01185795]]


In [129]:
cf3 = confusion_matrix(y_test,prediction3)
print "CONCATENATED: "
print cf3


CONCATENATED: 
[[239  84]
 [ 51 129]]


In [130]:
print "CONCATENATED: "
print f1_score(y_test,prediction3, average=None)

CONCATENATED: 
[ 0.77977162  0.65648855]


In [131]:
# save to pickle
import pickle

with open('Models/classifier.pkl', 'wb') as f:
    pickle.dump(clf3, f)
    
with open('Models/vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
    
with open('Models/lsa.pkl', 'wb') as f:
    pickle.dump(lsa, f)
