In [51]:
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [52]:
df = pd.read_csv('Dataset/2500_training_data.csv')
X,y = df['tweet'],df['label']
print X.shape, y.shape

(2195,) (2195,)


In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 45)

# Feature Extraction (TF-IDF) unigrams and bigrams

In [54]:
#   - Filters out terms that occur in only one document (min_df=2).
vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1,2),min_df=2)
X = vectorizer.fit_transform(X_train) 
train_tfidf_feature = X.toarray()
print train_tfidf_feature
print train_tfidf_feature.shape

[[ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.17981697  0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]
(1536, 2759)


# Feature Extraction (LSA)
Latent Semantic Analysis is a technique that analyzes relationship between a set of documents and of the terms. This method will extract contextual-usage meaning of words by statistical computations applied to a large corpus of text. 

Input: X[m][n] -> a matrix where m is the number of documents and n is the number of terms. 

The matrix X will be decomposed into three matrices called the U, S, and T. In doing the decomposition, a value of <i>k</i> will have to be picked since it will represent the number of concepts kept. 

<center>$X\approx USV^{T} $</center>

U[m][k] where the rows are the documents and the columns will be the mathematical concepts.

S[k][k] is a diagonal matrix where elements will be the amount of variation captured from each concept. 

V[m][k] transpose where the rows will be terms and the columns will be concepts.

In [55]:
lsa = TruncatedSVD(n_components = 100, n_iter=500)
train_lsa_feature = lsa.fit_transform(X)
print train_lsa_feature.shape

(1536, 100)


In [56]:
# #### To view list of related concepts
# terms = vectorizer.get_feature_names()
# for i, comp in enumerate(lsa_feature):
#     termsInComp = zip (terms, comp)
#     sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse = True) [:10]
#     print "Concept %d:" % i
#     for term in sortedTerms:
#         print term[0]
#     print " "

# Incremental SVM Classifier

# For TFIDF Feature only.

In [57]:
clf = SGDClassifier(alpha=0.0001, average =False, epsilon = 0.1, 
                    fit_intercept = 2, loss="hinge", 
                    penalty="l2", shuffle=True)
clf.partial_fit(train_tfidf_feature, y_train,classes=[0, 1])

ValueError: classes should include all valid labels that can be in y

# For LSA Feature only.

In [None]:
clf2 = SGDClassifier(alpha=0.0001, average =False, epsilon = 0.1, 
                    fit_intercept = 2, loss="hinge", 
                    penalty="l2", shuffle=True)
clf2.partial_fit(train_lsa_feature, y_train,classes=[0, 1])

# Concatenating features

In [None]:
train_final_representation = np.concatenate((train_tfidf_feature,train_lsa_feature),axis=1)

In [None]:
clf3 = SGDClassifier(alpha=0.0001, average =False, epsilon = 0.1, 
                    fit_intercept = 2, loss="hinge", 
                    penalty="l2", shuffle=True)
clf3.partial_fit(train_final_representation,y_train,classes=[0,1])

# Testing Classifiers

# For TFIDF Feature only.

In [None]:
X2 = vectorizer.transform(X_test)
test_tfidf_feature = X2.toarray()
accuracy = clf.score(test_tfidf_feature, y_test) * 100
prediction = clf.predict(test_tfidf_feature)
print "Accuracy: ", accuracy
# print clf.coef_

# For LSA Feature Only.

In [None]:
test_lsa_feature = lsa.transform(X2)
accuracy2 = clf2.score(test_lsa_feature, y_test) * 100
prediction2 = clf2.predict(test_lsa_feature)
print "Accuracy: ",accuracy2

# For Concatenated

In [None]:
test_final_representation = np.concatenate((test_tfidf_feature,test_lsa_feature),axis=1)
accuracy3 = clf3.score(test_final_representation,y_test) *100
prediction3 = clf3.predict(test_final_representation)
print "Accuracy: ",accuracy3
# print "Prediction: ",prediction3

In [None]:
print test_final_representation

In [None]:
cf1 = confusion_matrix(y_test,prediction)
print "CONFUSION MATRICES:"
print    "------------------------"
print "TFIDF: "
print cf1

cf2 = confusion_matrix(y_test,prediction2)
print "LSA: "
print cf2

cf3 = confusion_matrix(y_test,prediction3)
print "CONCATENATED: "
print cf3


In [None]:
print "F-SCORES:"
print"--------------"
print "TFIDF: "
print f1_score(y_test,prediction, average=None)

print "LSA: "
print f1_score(y_test,prediction2, average=None)

print "CONCATENATED: "
print f1_score(y_test,prediction3, average=None)