In [11]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


In [12]:
"Training Set:"% train.columns, train.shape, len(train)

('Training Set:', (31962, 3), 31962)

In [13]:
"Test Set:"% test.columns, test.shape, len(test)

('Test Set:', (17197, 2), 17197)

In [14]:
import re
def  clean_text(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    return df

In [15]:
test_clean = clean_text(test, "tweet")
train_clean = clean_text(train, "tweet")

In [16]:
from sklearn.utils import resample


train_majority = train_clean[train_clean.label==0]
train_minority = train_clean[train_clean.label==1]
train_minority_upsampled = resample(train_minority, 
                                 replace=True,    
                                 n_samples=len(train_majority),   
                                 random_state=123)
train_upsampled = pd.concat([train_minority_upsampled, train_majority])
train_upsampled['label'].value_counts()

1    29720
0    29720
Name: label, dtype: int64

In [25]:
# Since the tweets regarding hate speech are comparatively lesser than others.
# So in this is a situation of an imbalanced data, if you will, fit this data to train our hate speech detection model, 
# then the model will not generalize any hate speech because the data with context to the hate speech is very less than 
# the positive ones.
# we need to prepare the data to fit properly in our model.There are a number of methods you can use to deal with this.
# One approach is to use either oversampling or down sampling.
# In the case of oversampling, we use a function that repeatedly samples with replacement from the military class until the 
# class is the same size as the majority.

# For simplicity and reproducibility of the hate speech detection model.
# We will use the scikit learn pipeline with an SGD classifier.

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier

In [18]:

pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', SGDClassifier()),])

# SPLITING DATASET 

In [19]:
from sklearn.model_selection import train_test_split


In [20]:
X_train, X_test, y_train, y_test = train_test_split(train_upsampled['tweet'],
                                                    train_upsampled['label'],random_state = 0)

# MODEL TRAINING 

In [21]:
model = pipeline_sgd.fit(X_train, y_train)


In [22]:
y_predict = model.predict(X_test)


In [23]:
from sklearn.metrics import f1_score

In [24]:
f1_score(y_test, y_predict)

0.9694020398640091