In [1]:
import numpy as np
import pandas as pd
import os
import sys

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.feature_extraction.text import TfidfVectorizer

# Feature Reduction: common techniques include linear discriminant analysis (LDA), autoencoders, non-negative matrix factorization (NMF), principal component analysis (PCA), and truncated singular value decomposition (TSVD).

In [2]:
# Load the tweet sentiment CSV directly from GitHub and display the frame.
DATA_URL = 'https://raw.githubusercontent.com/laxmimerit/twitter-data/master/twitter30k_cleaned.csv'
df = pd.read_csv(DATA_URL)
df

Unnamed: 0,twitts,sentiment
0,robbiebronniman sounds like a great night,1
1,damn the person who stolde my wallet may karma...,1
2,greetings from the piano bench photo,1
3,drewryanscott i love it i love you haha forget...,1
4,kissthestars pretty pretty pretty please pakid...,0
...,...,...
29995,calumfan1 is it in any way related to photoshop,0
29996,swiz_nz really wow thats crap,0
29997,at the 2010 lexus hs250h press event again can...,0
29998,karmicunderpath ooh now there is a nice thought,1


In [3]:
# Check how many tweets fall into each sentiment class (class balance).
sentiment_counts = df['sentiment'].value_counts()
sentiment_counts

0    15000
1    15000
Name: sentiment, dtype: int64

In [4]:
# Features are the tweet text; the target is the sentiment label.
X, y = df['twitts'], df['sentiment']

In [5]:
# Vectorise the raw tweet text with TF-IDF. Reading from df['twitts'] (rather than
# re-using X) keeps this cell idempotent: re-running it no longer feeds the sparse
# matrix produced by a previous run back into the vectorizer.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split made
# later in the notebook, so the vocabulary and IDF weights also see the test tweets —
# consider fitting on the training split only.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['twitts'])

In [6]:
# Density of the TF-IDF matrix: stored entries divided by total cells. Using X.nnz
# instead of a hard-coded count keeps the figure correct if the data or the
# vectorizer settings change.
X.nnz / (X.shape[0] * X.shape[1])

0.00027735840098751366

In [7]:
# Number of distinct terms the vectorizer learned.
len(tfidf.vocabulary_)

42126

In [8]:
# Shapes of the TF-IDF feature matrix and of the label vector.
X.shape, y.shape

((30000, 42126), (30000,))

In [9]:
# Inspect the container types of the features and the labels.
type(X), type(y)

(scipy.sparse.csr.csr_matrix, pandas.core.series.Series)

In [10]:
# NOTE: for the SciPy sparse matrix this reports only the size of the Python wrapper
# object (a few dozen bytes), not the data/indices/indptr arrays that hold the values;
# sum the `.nbytes` of those arrays to measure the real storage.
sys.getsizeof(X)

64

# Non-Negative Matrix Factorization (NMF)


In [11]:
from sklearn.decomposition import NMF

In [12]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [13]:
%%time
# Fit a 50-component NMF on the training split only and keep the reduced training
# matrix. The seed fixes the factorisation's random initialisation state.
nmf = NMF(n_components=50, random_state=0)
X_train_nmf = nmf.fit_transform(X_train)



CPU times: user 1min 24s, sys: 13.5 s, total: 1min 37s
Wall time: 21.3 s


In [20]:
def run_svm(clf, X_train, X_test, y_train, y_test):
    """Train ``clf`` on the training split and print its test-set classification report.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``; ``fit`` is called on it.
        X_train: Training features passed to ``clf.fit``.
        X_test: Held-out features passed to ``clf.predict``.
        y_train: Training labels passed to ``clf.fit``.
        y_test: True labels the predictions are scored against.

    Returns:
        None. The report is written to stdout only.
    """
    clf.fit(X_train, y_train)
    predicted_labels = clf.predict(X_test)

    # Leading empty item emits a blank line so the report is set apart from prior output.
    print('', 'Printing Report', sep='\n')
    print(classification_report(y_test, predicted_labels))

In [15]:
%%time
clf = LinearSVC()

X_test_nmf = nmf.transform(X_test)

run_svm(clf, X_train_nmf, X_test_nmf, y_train, y_test)


Printing Report
              precision    recall  f1-score   support

           0       0.66      0.62      0.64      3000
           1       0.64      0.68      0.66      3000

    accuracy                           0.65      6000
   macro avg       0.65      0.65      0.65      6000
weighted avg       0.65      0.65      0.65      6000

CPU times: user 1.1 s, sys: 214 ms, total: 1.32 s
Wall time: 229 ms


# Truncated Singular Value Decomposition (TSVD)

In [16]:
from sklearn.decomposition import TruncatedSVD as TSVD

In [17]:
%%time
# Fit a 500-component truncated SVD on the training split only and keep the reduced
# (dense) training matrix. The seed fixes the solver's random state.
tsvd = TSVD(n_components=500, random_state=0)
X_train_tsvd = tsvd.fit_transform(X_train)

CPU times: user 42.6 s, sys: 5.11 s, total: 47.7 s
Wall time: 8.88 s


In [18]:
# Total variance captured by the 500 retained components.
# NOTE(review): this sums the absolute per-component variances; if the intent is the
# *fraction* of variance retained, `tsvd.explained_variance_ratio_` is probably the
# attribute to sum instead — confirm.
sum(tsvd.explained_variance_)

0.3824186985646074

In [21]:
%%time
clf = LinearSVC()

X_test_tsvd = tsvd.transform(X_test)

run_svm(clf, X_train_tsvd, X_test_tsvd, y_train, y_test)


Printing Report
              precision    recall  f1-score   support

           0       0.75      0.72      0.73      3000
           1       0.73      0.76      0.74      3000

    accuracy                           0.74      6000
   macro avg       0.74      0.74      0.74      6000
weighted avg       0.74      0.74      0.74      6000

CPU times: user 1.75 s, sys: 40.4 ms, total: 1.79 s
Wall time: 1.7 s


In [23]:
# Real memory footprint of the sparse matrix in bytes: the stored values plus the
# two index arrays that locate them.
d = sum(buf.nbytes for buf in (X.data, X.indptr, X.indices))

In [24]:
# Display the sparse matrix's storage size in bytes.
d

4326244

In [25]:
# Size in MiB that the same matrix would need if stored densely at 8 bytes per cell.
n_rows, n_cols = X.shape
n_rows * n_cols * 8 / 2 ** 20

9641.876220703125

In [26]:
# Memory held by the dense TSVD training matrix, in bytes. Unlike the sparse case, a
# NumPy array that owns its data is counted together with its buffer by getsizeof.
sys.getsizeof(X_train_tsvd)

96000120

In [27]:
# The same dense-matrix footprint expressed in MiB.
BYTES_PER_MIB = 2 ** 20
sys.getsizeof(X_train_tsvd) / BYTES_PER_MIB

91.55284881591797