In [40]:
%matplotlib inline

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

# --- Load and combine the two labelled tweet datasets ----------------------
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Keep only the last occurrence of each duplicated tweet.
# reset_index() is called without drop=True, so the previous row labels are
# preserved in a new 'index' column.
# NOTE(review): that 'index' column is carried into df_train_final, where it
# is NaN for every row coming from labeled_data.csv; consider
# reset_index(drop=True) if the column is not needed downstream.
df_train = df_train.drop_duplicates(subset=['tweet'], keep='last').reset_index()
print("Shape of Train set after removing duplicates:", df_train.shape)

df_offensive = pd.read_csv("labeled_data.csv")

# Collapse the 'class' column to a binary label: 0 -> 1, 2 -> 0 (1 is left as is).
# NOTE(review): presumably 0 = hate speech, 1 = offensive language and
# 2 = neither, matching the per-class count columns dropped below -- confirm.
# The result is assigned back to the frame instead of calling
# replace(..., inplace=True) on the selected column: an in-place call on a
# column selection does not update the parent frame under pandas Copy-on-Write.
df_offensive["class"] = df_offensive["class"].replace({0: 1, 2: 0})

df_offensive = (
    df_offensive
    .drop(columns=['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'])
    .rename(columns={'class': 'label'})
)

# Stack both sources; 'id' only exists in train.csv and is not a feature.
df_train_final = pd.concat([df_train, df_offensive]).drop(columns=['id'])


Shape of Train set after removing duplicates: (29530, 4)


In [41]:
# Inspect the combined training frame (train.csv rows followed by the
# labeled_data.csv rows, in the order they were concatenated).
df_train_final

Unnamed: 0,index,label,tweet
0,0.0,0,@user when a father is dysfunctional and is s...
1,1.0,0,@user @user thanks for #lyft credit i can't us...
2,2.0,0,bihday your majesty
3,4.0,0,factsguide: society now #motivation
4,5.0,0,[2/2] huge fan fare and big talking before the...
...,...,...,...
24778,,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,,0,"you've gone and broke the wrong heart baby, an..."
24780,,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,,1,youu got wild bitches tellin you lies


In [42]:

# Configs: constants shared by the GloVe loading and sentence-encoding cells.


# Dimensionality of the GloVe vectors; it selects the GloVe file below and
# sizes the zero-padding rows produced by encode_sentence.
EMBEDDING_DIM = 50
# Default number of word vectors per encoded sentence (encode_sentence
# truncates longer sentences and zero-pads shorter ones to this length).
SENTENCE_LENGTH = 256
# Path of the GloVe Twitter vectors whose dimensionality matches EMBEDDING_DIM.
GLOVE_FILE = f'GLOVE_FILE/glove.twitter.27B.{EMBEDDING_DIM}d.txt'


In [43]:
# Token -> embedding vector (float32), one entry per non-blank line of GLOVE_FILE.
# Each line is "<token> <v1> <v2> ... <vN>" separated by whitespace.
embeddings = {}

with open(GLOVE_FILE, 'r', encoding='utf-8') as file:
    for line in file:
        values = line.strip().split()
        if not values:
            # Blank line: nothing to index, and values[0] would raise.
            continue
        w = values[0]
        vectors = np.asarray(values[1:], dtype='float32')
        embeddings[w] = vectors
        

In [44]:
import string
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# Porter stemmer instance. It is not called by any active code in the cells
# visible here (word-level stemming is currently disabled).
ps = PorterStemmer()
# Translation table that deletes every ASCII punctuation character.
translator = str.maketrans('', '', string.punctuation)

# One or more consecutive digits.
_DIGITS_RE = re.compile(r'\d+')


def normalize_word(text):
    """Normalize a single token so it can be looked up in the embedding table.

    The token is lower-cased, ASCII punctuation and digits are removed, and
    any leading/trailing whitespace (spaces, tabs, newlines, ...) is stripped.
    Whitespace is stripped last so that a token such as "word\\n" -- which
    survives a plain ``split(' ')`` of a multi-line tweet -- normalizes to
    "word" instead of keeping the newline and missing the vocabulary.

    Args:
        text: A single token (str).

    Returns:
        The normalized token; an empty string when nothing is left.
    """
    text = text.lower().translate(translator)
    text = _DIGITS_RE.sub('', text)
    return text.strip()

In [45]:
# Compute the mean of all GloVe vectors. encode_sentence uses it as the
# embedding for words that are missing from the vocabulary.

# Pass 1: count the vectors and infer the dimensionality from the last line.
# The file is opened as UTF-8, like the embedding-loading cell, so decoding
# does not depend on the platform's default encoding.
n_vec = 0
line = ''
with open(GLOVE_FILE, 'r', encoding='utf-8') as f:
    for n_vec, line in enumerate(f, start=1):
        pass
if n_vec == 0:
    raise ValueError(f'{GLOVE_FILE} contains no vectors')
# First field of a line is the token, the remaining fields are the vector.
hidden_dim = len(line.split(' ')) - 1

# Pass 2: fill a preallocated matrix so only one copy of the vectors is held.
vecs = np.zeros((n_vec, hidden_dim), dtype=np.float32)

with open(GLOVE_FILE, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        vecs[i] = np.array([float(n) for n in line.split(' ')[1:]], dtype=np.float32)

average_vec = np.mean(vecs, axis=0)
print(average_vec)

[-0.21897273  0.1726903  -0.05617734  0.06307698  0.00960728 -0.23460951
 -0.16731708 -0.2561423   0.12990884 -0.34179983 -0.0741225   0.00533566
  0.7090389  -0.11390167  0.10614155  0.0918671   0.15881167  0.03158503
  0.22414173  0.2038661   0.05305528  0.04961339  0.11807609 -0.10199956
 -0.18345638  0.56560236  0.07183184  0.04322483 -0.3944268   0.06828406
  0.3954251   0.08794737  0.41605267 -0.27821    -0.5106839  -0.1644394
  0.09734457  0.02233139  0.19346268  0.15909804  0.8865828  -0.01498249
  0.10210968 -0.1295932  -0.32836685  0.13014711 -0.02061143  0.05735637
  0.14008194  0.22588335]


In [46]:
import string
# NOTE(review): WARN is not read by any code in the visible cells -- confirm
# whether it is still needed.
WARN = True


def encode_sentence(sentence, embeddings, sentence_length=SENTENCE_LENGTH):
    """Encode a sentence as a fixed-size matrix of word embeddings.

    The sentence is split on single spaces and each token is passed through
    normalize_word. Tokens that normalize to an empty string are skipped.
    Known words use their vector from ``embeddings``; unknown words use
    ``average_vec``. At most ``sentence_length`` words are encoded and the
    remaining rows are left as zeros (padding for batch training).

    The result is always float32: rows are written into a preallocated
    float32 matrix, so padded and unpadded sentences share one dtype and no
    float64 padding rows upcast the whole array.

    Args:
        sentence: The text to encode (str).
        embeddings: Mapping from word to a vector of length EMBEDDING_DIM.
        sentence_length: Number of rows in the returned matrix.

    Returns:
        np.ndarray of shape (sentence_length, EMBEDDING_DIM), dtype float32.
    """
    encoded_sentence = np.zeros((sentence_length, EMBEDDING_DIM), dtype=np.float32)
    n_filled = 0
    for raw_word in sentence.split(' '):
        if n_filled >= sentence_length:
            # Matrix is full: remaining words are truncated.
            break
        word = normalize_word(raw_word)
        if word == '':
            continue
        if word in embeddings:
            encoded_sentence[n_filled] = embeddings[word]
        else:
            # Out-of-vocabulary word: fall back to the mean GloVe vector.
            encoded_sentence[n_filled] = average_vec
        n_filled += 1
    return encoded_sentence

In [47]:
stop_word = stopwords.words('english')
stm = nltk.SnowballStemmer("english")

# Same words as stop_word, as a set for constant-time membership tests.
_STOP_WORD_SET = frozenset(stop_word)

# Ordered (pattern, replacement) rules applied before the legacy dot rule.
_RULES_BEFORE_DOTS = [
    (re.compile(r'@[A-Za-z0-9_]+'), ''),    # @mentions
    (re.compile(r'#'), ''),                 # hashtag symbol (tag text is kept)
    (re.compile(r'RT[\s]+'), ' '),          # retweet marker
    (re.compile(r'\n'), ''),
    (re.compile(r','), ''),
]

# Legacy rule: removes every run of dots TOGETHER WITH the character in front
# of it, e.g. "shit." -> "shi" and "you." -> "yo".
_LEGACY_DOT_RULE = re.compile(r'.[.]+')

# Ordered (pattern, replacement) rules applied after the dot rule. The order
# matters: URLs must be removed before '/', ':' and punctuation are stripped.
_RULES_AFTER_DOTS = [
    (re.compile(r'\w+:\/\/\S+'), ''),       # scheme://... links
    (re.compile(r'https?:\/\/\S+'), ''),    # hyperlinks
    (re.compile(r'/'), ' '),
    (re.compile(r'-'), ' '),
    (re.compile(r'_'), ' '),
    (re.compile(r'!'), ''),
    (re.compile(r':'), ' '),
    (re.compile(r'%'), ''),
    (re.compile(r'&'), ''),
    (re.compile(r'='), ' '),
    (re.compile(r' +'), ' '),               # collapse runs of spaces
    (re.compile(r'\[.*?\]'), ''),           # [bracketed] text
    (re.compile(r'https?://\S+|www\.\S+'), ''),
    (re.compile(r'<.*?>+'), ''),            # <tags>
    # All remaining ASCII punctuation (this also covers '$' and '^').
    (re.compile('[%s]' % re.escape(string.punctuation)), ''),
    (re.compile(r'\n'), ''),
    (re.compile(r'[0-9]+'), ''),            # numbers
]


def text_cleaner(input_text, keep_char_before_dots=False):
    """Clean a tweet for TF-IDF vectorisation.

    Steps, in order: remove @mentions, '#', 'RT', newlines and commas; apply
    the dot rule; remove links, bracketed text, tags, punctuation and digits;
    lower-case and strip; drop English stop words; stem each remaining word
    with the Snowball stemmer.

    Known defect kept as the default: the legacy dot rule deletes the
    character preceding every run of dots, so "shit." becomes "shi".
    NOTE(review): the pickled classifier loaded elsewhere in this notebook was
    presumably trained on text cleaned this way -- confirm, then retrain and
    switch the default.

    Args:
        input_text: The raw text; it is converted with str() first.
        keep_char_before_dots: When True the legacy dot rule is skipped; dots
            are then removed by the general punctuation rule and the
            preceding character is kept. Defaults to False (legacy output).

    Returns:
        The cleaned, stop-word-filtered and stemmed text as a single
        space-joined string.
    """
    text = str(input_text)
    for pattern, replacement in _RULES_BEFORE_DOTS:
        text = pattern.sub(replacement, text)
    if not keep_char_before_dots:
        text = _LEGACY_DOT_RULE.sub('', text)
    for pattern, replacement in _RULES_AFTER_DOTS:
        text = pattern.sub(replacement, text)

    text = text.lower().strip()
    kept_words = [word for word in text.split(' ') if word not in _STOP_WORD_SET]
    return " ".join(stm.stem(word) for word in kept_words)

In [48]:
def encode_X(dfx, embeddings):
    """Encode every sentence of a pandas object with encode_sentence.

    Args:
        dfx: pandas Series (or other object exposing ``.values``) of sentences.
        embeddings: Mapping from word to embedding vector, forwarded to
            encode_sentence.

    Returns:
        np.ndarray stacking the per-sentence matrices along a new first axis,
        i.e. one entry per element of ``dfx``. An empty input yields an empty
        array instead of raising.
    """
    return np.array([encode_sentence(x, embeddings) for x in dfx.values])

In [49]:
from sklearn.model_selection import train_test_split

# Features are the tweet texts, targets are the binary labels.
X = df_train_final['tweet'] 
y = df_train_final['label'] 
# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test =  train_test_split(X, y, train_size = 0.8, random_state = 3) 

# Replace the text Series with their GloVe encodings (one matrix per tweet).
# NOTE(review): X_train / X_test are rebound from text to arrays here, and the
# same names are reassigned again by the TF-IDF cell further down.
X_train = encode_X(X_train, embeddings)
X_test = encode_X(X_test, embeddings)

In [53]:
# Clean every tweet with text_cleaner, overwriting the raw 'tweet' column.
# NOTE(review): this mutates df_train_final in place, so re-running the cell
# cleans already-cleaned text. The execution counts also show it ran after
# the GloVe encoding cell (In [49]), which therefore encoded uncleaned tweets.
df_train_final['tweet']=df_train_final['tweet'].apply(text_cleaner)

In [50]:
import pickle 
# Load the previously trained classifier.
# NOTE(security): unpickling can execute arbitrary code; only load
# 'finalized_model.sav' when it comes from a trusted source.
with open('finalized_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [54]:
X = df_train_final['tweet'].astype(str)  # Cast to str so the TF-IDF vectoriser is fitted on plain strings.
y = df_train_final['label'].astype(str)  # Labels become strings (e.g. '0' / '1'); the prediction check later compares against '1'.
# Same 80/20 split and seed as the embedding cell, now on the cleaned text.
X_train, X_test, y_train, y_test =  train_test_split(X, y, train_size = 0.8, random_state = 3) 
df_train_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54313 entries, 0 to 24782
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   index   29530 non-null  float64
 1   label   54313 non-null  int64  
 2   tweet   54313 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 1.7+ MB


In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
# NOTE(review): the vectoriser is fitted here while the classifier comes from
# a pickle. The feature columns only line up if the model was trained with a
# vectoriser fitted on exactly the same cleaned X_train -- confirm, or persist
# the fitted vectoriser together with the model.
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(X_train)

# Classify one example sentence.
test = 'You are a piece of shit. I hate you. Are you alright?'
test = [text_cleaner(test)]
print('Input text:', test)
test_vect = vectoriser.transform(test)
pred = loaded_model.predict(test_vect)
print("pred", pred)
# pred holds one label per input text; index the single prediction explicitly
# instead of relying on the truthiness of an array comparison.
if pred[0] == '1':
    print('Text falls under hate and abusive category')
else:
    print('Text is safe.')

Input text: ['piec shi hate yo alright']
pred ['1']
Text falls under hate and abusive category
