In [116]:
# importing required libraries
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [260]:
data = pd.read_csv('train.csv', sep=',' )


In [261]:
data=data.drop(['id'], axis = 1)

In [262]:
data.head()

Unnamed: 0,label,tweet
0,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,0,Finally a transparant silicon case ^^ Thanks t...
2,0,We love this! Would you go? #talk #makememorie...
3,0,I'm wired I know I'm George I was made that wa...
4,1,What amazing service! Apple won't even talk to...


In [263]:
data.shape

(7920, 2)

In [264]:
data.describe()

Unnamed: 0,label
count,7920.0
mean,0.255808
std,0.436342
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [265]:
data.label.value_counts()

0    5894
1    2026
Name: label, dtype: int64

In [266]:
len(data)

7920

In [267]:
train, test = train_test_split(data, test_size = 0.3, stratify = data['label'], random_state=2)

In [272]:
train.label

2712    0
2619    0
5418    1
751     0
6436    1
       ..
2635    0
3500    0
4509    1
5058    1
1286    0
Name: label, Length: 5544, dtype: int64

In [269]:
train.label.value_counts(normalize=True)

0    0.744228
1    0.255772
Name: label, dtype: float64

In [270]:
test.label.value_counts(normalize=True)

0    0.744108
1    0.255892
Name: label, dtype: float64

In [198]:
tfidf_vectorizer = TfidfVectorizer(lowercase= True, max_features=1000, stop_words=ENGLISH_STOP_WORDS)

In [199]:
tfidf_vectorizer.fit(train.tweet)

TfidfVectorizer(max_features=1000,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}))

In [200]:
train_idf = tfidf_vectorizer.transform(train.tweet)
test_idf  = tfidf_vectorizer.transform(test.tweet)

In [57]:
train_idf

<5940x18541 sparse matrix of type '<class 'numpy.float64'>'
	with 79988 stored elements in Compressed Sparse Row format>

In [201]:
model_LR = LogisticRegression()

In [202]:
model_LR.fit(train_idf, train.label)

LogisticRegression()

In [203]:
predict_train = model_LR.predict(train_idf)

In [204]:
predict_test = model_LR.predict(test_idf)

In [205]:
# f1 score on train data
f1_score(y_true= train.label, y_pred= predict_train)

0.8234448040273282

In [206]:
f1_score(y_true= test.label, y_pred= predict_test)

0.7652464494569757

In [207]:
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
Naive = naive_bayes.MultinomialNB()
Naive.fit(train_idf, train.label)
# predict the labels on validation dataset
predictions_NB = Naive.predict(test_idf)
# Use accuracy_score function to get the accuracy

print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, test.label)*100)

Naive Bayes Accuracy Score ->  88.42592592592592


In [208]:
f1_score(y_true= test.label, y_pred= predictions_NB)

0.7787610619469025

In [209]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(train_idf, train.label)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(test_idf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, test.label)*100)

SVM Accuracy Score ->  88.34175084175084


In [210]:
f1_score(y_true= test.label, y_pred= predictions_SVM)

0.7820613690007867

In [245]:
from sklearn.linear_model import SGDClassifier
text_clf_svm =  SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3,  random_state=42)

text_clf_svm.fit(train_idf, train.label)
predicted_svm = text_clf_svm.predict(test_idf)
#np.mean(predicted_svm == train.label)
print("SVM Accuracy Score -> ",accuracy_score(predicted_svm, test.label)*100)

SVM Accuracy Score ->  88.93097643097643


In [246]:
f1_score(y_true= test.label, y_pred= predicted_svm)

0.790771678599841

In [258]:
from sklearn.model_selection import GridSearchCV
parameters = {
               'clf-svm__alpha': (1e-2, 1e-3)
 }


In [259]:
gs_clf = GridSearchCV(text_clf_svm, parameters, n_jobs=1)
gs_clf = gs_clf.fit(train_idf, train.label)

ValueError: Invalid parameter clf-svm for estimator SGDClassifier(alpha=0.001, random_state=42). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
gs_clf.best_score_
gs_clf.best_params_

In [228]:
pipeline = Pipeline(steps= [('tfidf', TfidfVectorizer(lowercase=True,
                                                      max_features=1000,
                                                      stop_words= ENGLISH_STOP_WORDS)),
                            ('model', LogisticRegression())])

In [229]:
pipeline.fit(train.tweet, train.label)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=1000,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterwards', 'again',
                                                       'against', 'all',
                                                       'almost', 'alone',
                                                       'along', 'already',
                                                       'also', 'although',
                                                       'always', 'am', 'among',
                                                       'amongst', 'amoungst',
                                                       'amount', 'an', 'and',
                                                       'another', 'any',
                                                       'anyhow', 'anyone',
           

In [230]:
pipeline.predict(train.tweet)

array([0, 0, 1, ..., 1, 1, 0], dtype=int64)

In [285]:
testdata = pd.read_csv('test.csv', sep=',' )

In [286]:
ftestdata=testdata.drop(['id'], axis = 1)

In [287]:
ftestdata.tweet

0       I hate the new #iphone upgrade. Won't let me d...
1       currently shitting my fucking pants. #apple #i...
2       I'd like to puts some CD-ROMS on my iPad, is t...
3       My ipod is officially dead. I lost all my pict...
4       Been fighting iTunes all night! I only want th...
                              ...                        
1948    #SamsungGalaxyNote7 Explodes, Burns 6-Year-Old...
1949    Now Available - Hoodie. Check it out here - ht...
1950    There goes a crack right across the screen. If...
1951    @codeofinterest as i said #Adobe big time we m...
1952    Finally I got it .. thanx my father .. #Samsun...
Name: tweet, Length: 1953, dtype: object

In [288]:
lr1_val_predict = lr1.predict(ftestdata.tweet)

In [282]:
tfidf_vectorizer.fit(ftestdata.tweet)

TfidfVectorizer(max_features=1000,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}))

In [215]:
val_idf  = tfidf_vectorizer.transform(ftestdata.tweet)

In [216]:
val_idf

<1953x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 16890 stored elements in Compressed Sparse Row format>

In [217]:
test_idf

<2376x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 20330 stored elements in Compressed Sparse Row format>

In [218]:
predict_test = model_LR.predict(val_idf)

In [253]:
predict_test_svm = text_clf_svm.predict(val_idf)

In [220]:
predict_test_naive = Naive.predict(val_idf)

In [289]:
testdata['label']=lr1_val_predict

In [290]:
submission = testdata[['id','label']]

submission.to_csv("submission_countvect.csv", index=False)
submission.tail()

Unnamed: 0,id,label
1948,9869,0
1949,9870,0
1950,9871,1
1951,9872,1
1952,9873,0


In [24]:
from joblib import dump

In [25]:
dump(pipeline, filename="text_classification.joblib")

['text_classification.joblib']

In [26]:
data[data.label == 1]

Unnamed: 0,id,label,tweet
4,5,1,What amazing service! Apple won't even talk to...
5,6,1,iPhone software update fucked up my phone big ...
10,11,1,hey #apple when you make a new ipod dont make ...
11,12,1,Ha! Not heavy machinery but it does what I nee...
12,13,1,Contemplating giving in to the iPhone bandwago...
...,...,...,...
7901,7902,1,@brendan_brady1: The new apple advert can fuck...
7902,7903,1,All I wanna do is put music from my iTunes on ...
7904,7905,1,#FuckYou #Apple one thing stops working and I ...
7908,7909,1,SECOND FUCKING TIME. Stay on hold for 30 mins ...
