In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score

## Load Dataset

In [3]:
directory = '../data/bot_detection/'


def _to_labeled_frame(raw):
    """Reshape a headerless raw CSV frame into the four-column layout used below.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame read with ``header=None``; column 0 is used as the label and
        column 1 as the text.

    Returns
    -------
    pandas.DataFrame
        Columns ``index`` (0..n-1 row counter), ``label``, ``mark`` (the
        constant ``'a'``) and ``tweet`` (the text with newline characters
        replaced by single spaces).
    """
    return pd.DataFrame({
        'index': range(len(raw)),
        'label': raw[0],
        'mark': ['a'] * len(raw),
        # Flatten multi-line texts so each sample is a single line.
        'tweet': raw[1].replace(r'\n', ' ', regex=True),
    })


# Both splits go through the same helper so their layouts cannot drift apart.
train = _to_labeled_frame(pd.read_csv(directory + "train.csv", header=None))
test = _to_labeled_frame(pd.read_csv(directory + "test.csv", header=None))

In [4]:
# Pull the raw text and label columns out as plain arrays for scikit-learn.
train_sentences, train_labels = train["tweet"].values, train["label"].values
test_sentences, test_labels = test["tweet"].values, test["label"].values

In [5]:
# Candidate values for the SVM regularisation parameter C.
tuned_parameters = [{'clf__C': [1, 10, 100, 1000]}]

# TF-IDF features feeding a linear SVM.
# Recent scikit-learn releases validate constructor parameters and accept
# stop_words only as 'english', a list, or None, so the NLTK stop-word set is
# converted to a list (sorted to keep the order deterministic).
# random_state pins LinearSVC's internal data shuffling so reruns are repeatable.
textclf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
    ('clf', LinearSVC(random_state=42)),
])

# Cross-validated grid search over C, ranked by macro-averaged F1, using all cores.
clf = GridSearchCV(textclf, tuned_parameters, scoring='f1_macro', n_jobs=-1)
clf.fit(train_sentences, train_labels)

print(": Best parameters set found on development set:")
print(clf.best_params_)


: Best parameters set found on development set:
{'clf__C': 1}


In [49]:
# Score the fitted grid-search model on the held-out test split.
predictions = clf.predict(test_sentences)
print(classification_report(test_labels, predictions, digits=4))

# Headline metrics expressed as percentages. f1_score is called with its
# defaults, so it reports the F1 of the positive class (label 1) rather than
# the macro average used for model selection.
acc = 100 * accuracy_score(test_labels, predictions)
f1 = 100 * f1_score(test_labels, predictions)
print("SVM: ", acc)
print("SVM: ", f1)

              precision    recall  f1-score   support

           0     0.8264    0.8185    0.8224     55712
           1     0.7744    0.7837    0.7790     44288

    accuracy                         0.8031    100000
   macro avg     0.8004    0.8011    0.8007    100000
weighted avg     0.8033    0.8031    0.8032    100000

SVM:  80.306
SVM:  77.89872963145845


In [52]:
# Tabulate per-candidate fit/score timings and cross-validation scores.
cv_results_frame = pd.DataFrame(clf.cv_results_)
cv_results_frame

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,18.985604,0.665977,5.49897,0.080077,1,{'clf__C': 1},0.791132,0.790996,0.790571,0.7909,0.000239,1
1,88.793021,10.870773,7.315534,0.707599,10,{'clf__C': 10},0.774503,0.774175,0.773177,0.773952,0.000564,2
2,154.858631,8.297325,5.433154,1.490428,100,{'clf__C': 100},0.737833,0.739435,0.736937,0.738068,0.001033,3
3,150.103343,21.437671,2.608736,0.42776,1000,{'clf__C': 1000},0.71877,0.720953,0.721344,0.720356,0.001132,4
