In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [2]:
train = pd.read_csv("./data/trainP.csv")

# Splitting data
X_train, X_test, Y_train, Y_test = train_test_split(train["TweetText"], train["Label"], test_size=0.2, random_state=42, shuffle=True)

# Train models

## Logistic Regression

In [3]:
lr = Pipeline([('tfidf', TfidfVectorizer()),('lr', LogisticRegression()),])
lr.fit(X_train, Y_train)
test_predict = lr.predict(X_test)
lr_tra = lr.score(X_train,Y_train)
lr_ta = accuracy_score(test_predict, Y_test)
print(classification_report(Y_test, test_predict, target_names=("Politics", "Sports"), digits=3))

              precision    recall  f1-score   support

    Politics      0.954     0.947     0.950       638
      Sports      0.949     0.957     0.953       667

    accuracy                          0.952      1305
   macro avg      0.952     0.952     0.952      1305
weighted avg      0.952     0.952     0.952      1305



## SGD Classifier

In [4]:
sgd = Pipeline([('tfidf', TfidfVectorizer()),('sgd', SGDClassifier()),])
sgd.fit(X_train, Y_train)
test_predict = sgd.predict(X_test)
sgd_tra = sgd.score(X_train,Y_train)
sgd_ta = accuracy_score(test_predict, Y_test)
print(classification_report(Y_test, test_predict, target_names=("Politics", "Sports"), digits=3))

              precision    recall  f1-score   support

    Politics      0.954     0.948     0.951       638
      Sports      0.951     0.957     0.954       667

    accuracy                          0.952      1305
   macro avg      0.953     0.952     0.952      1305
weighted avg      0.953     0.952     0.952      1305



## RandomForest Classifier

In [5]:
rf =  Pipeline([('tfidf', TfidfVectorizer()),('rf', RandomForestClassifier()),])
rf.fit(X_train, Y_train)
test_predict = rf.predict(X_test)
rf_tra = rf.score(X_train,Y_train)
rf_ta = accuracy_score(test_predict, Y_test)
print(classification_report(Y_test, test_predict, target_names=("Politics", "Sports"), digits=3))

              precision    recall  f1-score   support

    Politics      0.953     0.887     0.919       638
      Sports      0.899     0.958     0.927       667

    accuracy                          0.923      1305
   macro avg      0.926     0.923     0.923      1305
weighted avg      0.925     0.923     0.923      1305



## Gradient Boosting Classfier

In [6]:
gdb =  Pipeline([('tfidf', TfidfVectorizer()),('gdb', GradientBoostingClassifier()),])
gdb.fit(X_train, Y_train)
test_predict = gdb.predict(X_test)
gdb_tra = gdb.score(X_train,Y_train)
gdb_ta = accuracy_score(test_predict, Y_test)
print(classification_report(Y_test, test_predict, target_names=("Politics", "Sports"), digits=3))

              precision    recall  f1-score   support

    Politics      0.956     0.677     0.793       638
      Sports      0.758     0.970     0.851       667

    accuracy                          0.827      1305
   macro avg      0.857     0.824     0.822      1305
weighted avg      0.855     0.827     0.823      1305



## KNeighborsClassifier

In [7]:
knn =  Pipeline([('tfidf', TfidfVectorizer()),('knn', KNeighborsClassifier(1)),])
knn.fit(X_train, Y_train)
test_predict = knn.predict(X_test)
knn_tra = knn.score(X_train,Y_train)
knn_ta = accuracy_score(test_predict, Y_test)
print(classification_report(Y_test, test_predict, target_names=("Politics", "Sports"), digits=3))

              precision    recall  f1-score   support

    Politics      0.935     0.926     0.931       638
      Sports      0.930     0.939     0.934       667

    accuracy                          0.933      1305
   macro avg      0.933     0.932     0.933      1305
weighted avg      0.933     0.933     0.933      1305



# Evaluate models

In [8]:
print("|              |        Logistic Regression       |            SGD            |        RandomForest              |    Gradient Boosting      |       KNeighbors          |")
print("| Dataset Type | Train Score     | Test Score     |Train Score   | Test Score | Train Score     | Test Score     |Train Score   | Test Score |Train Score   | Test Score |")
print("| Score        | {:.4f}          | {:.4f}         | {:.4f}       | {:.4f}     | {:.4f}          | {:.4f}         | {:.4f}       | {:.4f}     | {:.4f}       | {:.4f}     |".format(lr_tra, lr_ta, sgd_tra, sgd_ta, rf_tra, rf_ta, gdb_tra, gdb_ta, knn_tra, knn_ta))

|              |        Logistic Regression       |            SGD            |        RandomForest              |    Gradient Boosting      |       KNeighbors          |
| Dataset Type | Train Score     | Test Score     |Train Score   | Test Score | Train Score     | Test Score     |Train Score   | Test Score |Train Score   | Test Score |
| Score        | 0.9893          | 0.9517         | 1.0000       | 0.9525     | 1.0000          | 0.9234         | 0.8617       | 0.8268     | 1.0000       | 0.9326     |


# CrossValidation 

In [9]:
pipelines = [lr, sgd, rf, gdb, knn]
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
for model in pipelines:
    m_scores = cross_val_score(model, X_train, Y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    print(f'{model} accuracy: %.3f (%.3f)' % (np.mean(m_scores), np.std(m_scores)))

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('lr', LogisticRegression())]) accuracy: 0.941 (0.010)
Pipeline(steps=[('tfidf', TfidfVectorizer()), ('sgd', SGDClassifier())]) accuracy: 0.951 (0.009)
Pipeline(steps=[('tfidf', TfidfVectorizer()), ('rf', RandomForestClassifier())]) accuracy: 0.920 (0.011)
Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('gdb', GradientBoostingClassifier())]) accuracy: 0.837 (0.014)
Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('knn', KNeighborsClassifier(n_neighbors=1))]) accuracy: 0.919 (0.010)


# Selection 

We select the SGD Classifier as the better model for our task.
Let's now try it on unlabeled data.

In [10]:
test = pd.read_csv("./data/testP.csv")
print(test)

                 TweetId                                          TweetText
0     306486520121012224  home stab bennett wide get threaten 28 throw y...
1     286353402605228032  series httptcoygjepjkf mass pound avoirdupois ...
2     289531046037438464  sochi2014 construction shore along httptco8dvi...
3     306451661403062273  httptco4qx0fhypmp foreign video transcript htt...
4     297941800658812928  player first usd ricky go 400000 iplauction in...
...                  ...                                                ...
2605  282023761044189184  eliminator chidambaram home runnersup played q...
2606  303879735006601216  reesedward 16 peacekeeping here httptcoex2tad8...
2607  297956846046703616  first ipl sunrisersipl purchase iplauction perera
2608  304265049537658880  seckerry thanks welcome statedept4us reprobert...
2609  306430391928115200  account here picture room staff httptcohdf9uwn...

[2610 rows x 2 columns]


In [11]:
test_predict = sgd.predict(test["TweetText"].apply(lambda x: np.str_(x)))
test["Label"] = test_predict

In [12]:
test.head(20)

Unnamed: 0,TweetId,TweetText,Label
0,306486520121012224,home stab bennett wide get threaten 28 throw y...,Sports
1,286353402605228032,series httptcoygjepjkf mass pound avoirdupois ...,Sports
2,289531046037438464,sochi2014 construction shore along httptco8dvi...,Politics
3,306451661403062273,httptco4qx0fhypmp foreign video transcript htt...,Politics
4,297941800658812928,player first usd ricky go 400000 iplauction in...,Sports
5,305722428531802112,fabiano viswanathan draw caruana httptco35gg3n...,Sports
6,304713516256997377,lfctvliverpoolfccom tweet u lfctv tonight emai...,Sports
7,234999630725783553,begin paralympic olympics follow sure latest u...,Politics
8,303712268372283392,thanks big compliment richaanirudh,Sports
9,304215754130194432,espargarxf3 polespargaro two nhttptcocubrd27q ...,Sports


In [13]:
test = test.drop("TweetText",1)
test.to_csv('./data/submission.csv', index=False)

from pickle import dump
dump(lr, open('lr.pkl', 'wb'))
dump(sgd, open('sgd.pkl', 'wb'))
dump(rf, open('rf.pkl', 'wb'))
dump(gdb, open('gdb.pkl', 'wb'))
dump(knn, open('knn.pkl', 'wb'))

  """Entry point for launching an IPython kernel.
