In [1]:
# Model test (SVM)

import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer
import re
# import string
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import warnings
warnings.filterwarnings("ignore")

def remove_pattern(input_txt,pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
    return input_txt

In [2]:
data = pd.read_csv('../train.csv')
data = data.drop(['id'], axis = 1)

data['tidy_tweet'] = np.vectorize(remove_pattern)(data['tweet'], "@[\w]*")
data['tidy_tweet'] = data['tidy_tweet'].str.replace('[^a-zA-Z]', " ")

tokenized_tweet = data['tidy_tweet'].apply(lambda x : x.split())
pstem = PorterStemmer()
tokenized_tweet = tokenized_tweet.apply(lambda x : [pstem.stem(i) for i in x])

for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
data['tidy_tweet'] = tokenized_tweet

from sklearn.utils import resample
majority = data[data.label == 0]
minority = data[data.label == 1]
 
minority_upsampled = resample(minority, 
                                 replace = True,
                                 n_samples = len(majority),  
                                 random_state = 123)
 
data = pd.concat([majority, minority_upsampled])

x_train, x_test, y_train, y_test = train_test_split(data['tidy_tweet'], data['label'], test_size = 0.1, random_state = 42)

# using countvectorizer

In [3]:
# using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words = 'english')
train = cv.fit_transform(x_train)
test = cv.transform(x_test)

In [4]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
sv = SVC()

sv.fit(train, y_train)
predicted = sv.predict(test)
print(accuracy_score(predicted, y_test))
r=confusion_matrix(y_test, predicted)
print(r)
print("True Positive Rate (TPR) :", (r[0][0] / (r[0][0] + r[1][0])) * 100)
print("True Negative Rate (TNR) :", (r[1][1] / (r[1][1] + r[0][1])) * 100)
print("False Negative Rate (FNR) :", (r[1][0] / (r[1][0] + r[0][0])) * 100)
print("False Positive Rate (FPR) :", (r[0][1] / (r[0][1] + r[1][1])) * 100)
c=(classification_report(y_test, predicted))
print(c)

0.9931022880215343
[[2947   30]
 [  11 2956]]
True Positive Rate (TPR) : 99.62812711291413
True Negative Rate (TNR) : 98.99531145344943
False Negative Rate (FNR) : 0.37187288708586885
False Positive Rate (FPR) : 1.0046885465505693
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      2977
           1       0.99      1.00      0.99      2967

    accuracy                           0.99      5944
   macro avg       0.99      0.99      0.99      5944
weighted avg       0.99      0.99      0.99      5944



# using TfidfVectorizer

In [5]:
# using TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(stop_words = 'english')
train1 = tf.fit_transform(x_train)  #train the vectorizer, build the vocablury
test1 = tf.transform(x_test)  #get same encodings on test data as of vocabulary built

In [6]:
sv.fit(train1, y_train)
predicted1 = sv.predict(test1)
print(accuracy_score(predicted1, y_test))
r=confusion_matrix(y_test, predicted1)
print(r)
print("True Positive Rate (TPR) :", (r[0][0] / (r[0][0] + r[1][0])) * 100)
print("True Negative Rate (TNR) :", (r[1][1] / (r[1][1] + r[0][1])) * 100)
print("False Negative Rate (FNR) :", (r[1][0] / (r[1][0] + r[0][0])) * 100)
print("False Positive Rate (FPR) :", (r[0][1] / (r[0][1] + r[1][1])) * 100)
c=(classification_report(y_test, predicted))
print(c)

0.995121130551817
[[2948   29]
 [   0 2967]]
True Positive Rate (TPR) : 100.0
True Negative Rate (TNR) : 99.0320427236315
False Negative Rate (FNR) : 0.0
False Positive Rate (FPR) : 0.9679572763684913
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      2977
           1       0.99      1.00      0.99      2967

    accuracy                           0.99      5944
   macro avg       0.99      0.99      0.99      5944
weighted avg       0.99      0.99      0.99      5944

