In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_train.drop_duplicates(subset=['tweet'], keep='last', inplace=True)
df_train.reset_index(inplace=True)
print("Shape of Train set after removing duplicates:", df_train.shape)
df_offensive =pd.read_csv("labeled_data.csv")
df_offensive["class"].replace({0: 1}, inplace=True)
df_offensive["class"].replace({2: 0}, inplace=True)
df_offensive.drop(['Unnamed: 0','count','hate_speech','offensive_language','neither'],axis=1,inplace=True)
df_offensive.rename(columns ={'class':'label'}, inplace = True)
df_train_final = pd.concat([df_train,df_offensive])
df_train_final.drop(['id'],axis=1,inplace=True)


In [None]:
df_train_final

In [3]:
import re
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_cleaner(input_text):

    input_text = input_text.lower() 
    input_text = re.sub(r'@[A-Za-z0-9_]+','',str(input_text))    # Removing @mentions
    input_text = re.sub(r'#','',str(input_text))                 # Removing #tag symbol
    input_text = re.sub(r'RT[\s]+',' ',input_text)   
    input_text = REPLACE_BY_SPACE_RE.sub(' ', input_text)
    input_text = BAD_SYMBOLS_RE.sub('', input_text)  
    input_text = input_text.replace('x', '')
    input_text = ' '.join(word for word in input_text.split() if word not in STOPWORDS) 
    return input_text

In [4]:
df_train_final['tweet']=df_train_final['tweet'].apply(text_cleaner)

In [5]:
df_train_final.shape

(54313, 3)

In [6]:
X = df_train_final['tweet'].astype(str)  # Converting to string, because vectorizer does'nt accept list.
y = df_train_final['label'].astype(str)  # Converting to string, because vectorizer does'nt accept list.
X_train, X_test, y_train, y_test =  train_test_split(X, y, train_size = 0.8, random_state = 3) 
df_train_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54313 entries, 0 to 24782
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   index   29530 non-null  float64
 1   label   54313 non-null  int64  
 2   tweet   54313 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 1.7+ MB


In [7]:
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(X_train)
print('No. of feature_words: ', len(vectoriser.get_feature_names()))

No. of feature_words:  281664




In [8]:
X_train = vectoriser.transform(X_train)
X_test  = vectoriser.transform(X_test)

In [None]:
model_svc = SVC(C=1000, gamma= 0.001, kernel='rbf').fit(X_train, y_train)
prediction2=model_svc.predict(X_test)
print(confusion_matrix(y_test,prediction2))
print (metrics.classification_report(y_test, prediction2))

In [None]:
import pickle 
pickle.dump(model_svc, open('SVC_test_model.sav', 'wb'))



In [None]:
loaded_model = pickle.load(open('SVC_test_model.sav', 'rb'))

In [125]:



test = 'I want to fuck with you even if you are underage.'
test= [text_cleaner(test)]
print('Input text:', [test])
test_vect = vectoriser.transform(test)
pred = loaded_model.predict(test_vect)
print("pred=", pred)
if (pred=='1'):
    print('Text falls under hate and abusive category')
else:
    print('Text is safe.')

Input text: [['want fuck even underage']]
pred= ['0']
Text is safe.


In [118]:
model_1= MultinomialNB().fit(X_train, y_train)
prediction1=model1.predict(X_test)
print(confusion_matrix(y_test,prediction1))
print (metrics.classification_report(y_test, prediction1))

[[5985  394]
 [ 474 4010]]
              precision    recall  f1-score   support

           0       0.93      0.94      0.93      6379
           1       0.91      0.89      0.90      4484

    accuracy                           0.92     10863
   macro avg       0.92      0.92      0.92     10863
weighted avg       0.92      0.92      0.92     10863



In [59]:
pickle.dump(model_1, open('test_model_multinomial.sav', 'wb'))

In [63]:
loaded_model_1 = pickle.load(open('test_model_multinomial.sav', 'rb'))


In [128]:

test = ''
test= [text_cleaner(test)]
print('Input text:', [test])
test_vect = vectoriser.transform(test)
pred = loaded_model_1.predict(test_vect)
print("pred=", pred)
if (pred=='1'):
    print('Text falls under hate and abusive category')
else:
    print('Text is safe.')

Input text: [['want fuck']]
pred= ['1']
Text falls under hate and abusive category


In [76]:
loaded_model_M = pickle.load(open('final_model_multinomial.sav', 'rb'))
loaded_model_SVC = pickle.load(open('final_model_SVC.sav', 'rb'))
loaded_model_Logistic= pickle.load(open('final_model_Logistic.sav', 'rb'))
loaded_model_RF= pickle.load(open('final_model_RandomForest.sav', 'rb'))
loaded_model_DT= pickle.load(open('final_model_DecisionTreeClassifier.sav', 'rb'))

In [77]:
test = 'I like you'
test= [text_cleaner(test)]
print('Input text:', [test])
test_vect = vectoriser.transform(test)
pred1 = loaded_model_M.predict(test_vect)
pred2 = loaded_model_SVC.predict(test_vect)
pred3 = loaded_model_Logistic.predict(test_vect)
pred4 = loaded_model_RF.predict(test_vect)
pred5 = loaded_model_DT.predict(test_vect)
print('\nMultinomial model ')
if (pred1=='1'):
    print('  Text falls under hate and abusive category')
else:
    print('  Text is safe.')

print('\nSVC model ')
if (pred2=='1'):
    print('   Text falls under hate and abusive category')
else:
    print('   Text is safe.')
print('\nLogistic Regression ')
if (pred3=='1'):
    print('   Text falls under hate and abusive category')
else:
    print('   Text is safe.')    

print('\nRandom Forest ')

if (pred4=='1'):
    print('   Text falls under hate and abusive category')
else:
    print('   Text is safe.') 
    
print('\nDecision Tree Classifier ')
if (pred5=='1'):
    print('  Text falls under hate and abusive category')
else:
    print('  Text is safe.')  

Input text: [['like']]


ValueError: X has 281664 features, but MultinomialNB is expecting 339477 features as input.