In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import re
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, one per line.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Notebook-wide display and warning configuration.
import warnings

# Allow up to 200 characters per column value when frames are displayed.
pd.set_option('display.max_colwidth', 200)
# Silence DeprecationWarning only; every other warning category still shows.
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [None]:
# Load the labelled training tweets and the unlabelled test tweets.
# Both files are read from the same absolute directory so the cell does not
# depend on the current working directory (the original mixed an absolute
# path for train.csv with a cwd-relative '../input/...' path for test.csv).
DATA_DIR = '/kaggle/input/twitter-sentiment-analysis-hatred-speech'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Display the (rows, columns) shape of the training frame.
train.shape

In [None]:
# Character length of every tweet, computed the same way for both splits
# (train first, then test).
ltrain, ltest = (frame['tweet'].str.len() for frame in (train, test))

In [None]:
# Compare the tweet-length distributions of the train and test splits.
# The histograms are drawn semi-transparent so neither hides the other, and
# the legend is drawn explicitly: the `label=` arguments have no visible
# effect unless `legend()` is called.
fig, ax = plt.subplots()
ax.hist(ltrain, bins=20, alpha=0.6, label='train')
ax.hist(ltest, bins=20, alpha=0.6, label='test')
ax.set(title='Tweet length distribution',
       xlabel='Tweet length (characters)',
       ylabel='Number of tweets')
ax.legend()
plt.show()


In [None]:
# Stack train and test into one frame (fresh 0..n-1 index) so the text
# cleaning below is applied to both splits identically.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` gives the same
# result on every pandas version.
combi = pd.concat([train, test], ignore_index=True)

In [None]:
# Display the combined train + test frame.
combi

In [None]:
def cleaner(inp, pattern):
    """Remove every substring of ``inp`` that matches the regex ``pattern``.

    Parameters
    ----------
    inp : str
        Text to clean.
    pattern : str
        Regular expression describing the substrings to delete.

    Returns
    -------
    str
        ``inp`` with all matches of ``pattern`` removed.

    Notes
    -----
    The substitution is done with ``pattern`` itself. Re-using each matched
    string as a new regex (the previous approach) misbehaves in two ways:
    a short match such as ``@ab`` also eats the prefix of a longer one
    (``@abc`` -> ``c``), and matched text containing regex metacharacters is
    interpreted as a pattern instead of literally.
    """
    return re.sub(pattern, '', inp)

In [None]:
# Strip Twitter handles (an '@' followed by any word characters) from every
# tweet and store the result in a new 'clean' column.
# The pattern is a raw string: "\w" inside a normal string literal is an
# invalid escape sequence that newer Python versions warn about.
combi['clean'] = np.vectorize(cleaner)(combi['tweet'], r"@[\w]*")

In [None]:
# Replace every character that is not an ASCII letter or '#' with a space.
# `regex=True` is required: since pandas 2.0 `str.replace` treats the pattern
# as a literal string by default, which would turn this into a no-op.
combi['clean'] = combi['clean'].str.replace("[^a-zA-Z#]", " ", regex=True)

In [None]:
# Keep only the words longer than three characters and re-join them with
# single spaces.
combi['clean'] = combi['clean'].apply(
    lambda text: " ".join([word for word in text.split() if len(word) > 3])
)

In [None]:
# Preview the first rows of the combined frame, including the 'clean' column.
combi.head()

In [None]:
# Split each cleaned tweet on whitespace into a list of word tokens.
tokens = combi['clean'].apply(str.split)

In [None]:
# Import only the class that is used; a star import would pull every public
# name of nltk.stem.porter into the notebook namespace.
from nltk.stem.porter import PorterStemmer

# Porter stemmer instance used to reduce tokens to their stems.
stemmer = PorterStemmer()

In [None]:
# Stem every token of every tweet; each row stays a list of strings.
tokens = tokens.apply(lambda words: list(map(stemmer.stem, words)))

In [None]:
# Join each tweet's stemmed tokens back into one space-separated string and
# store it as the cleaned text.
# This is done in one vectorised pass instead of assigning Series elements
# one by one. `tokens` itself is left as lists, so re-running this cell
# cannot join an already-joined string character by character.
combi['clean'] = tokens.apply(" ".join)

In [None]:
# Concatenate the cleaned tweets into single space-separated strings:
# every row, rows labelled 0, and rows labelled 1.
all_words = ' '.join(combi['clean'])
good_words = ' '.join(combi.loc[combi['label'] == 0, 'clean'])
bad_words = ' '.join(combi.loc[combi['label'] == 1, 'clean'])

In [None]:
def extract_trend(x):
    """Collect the hashtags found in each text of ``x``.

    Parameters
    ----------
    x : iterable of str
        Texts to scan.

    Returns
    -------
    list of list of str
        One inner list per input text, holding the word characters that
        follow each '#' (the '#' itself is not included), in order of
        appearance. Texts without hashtags yield an empty list.
    """
    return [re.findall(r"#(\w+)", text) for text in x]

In [None]:
# Per-tweet hashtag lists for the rows labelled 0 (ht_good) and the rows
# labelled 1 (ht_bad).
ht_good, ht_bad = (
    extract_trend(combi.loc[combi['label'] == lbl, 'clean']) for lbl in (0, 1)
)

In [None]:
# Flatten the per-tweet hashtag lists into one flat list per class.
# A comprehension is linear in the number of hashtags, whereas
# `sum(list_of_lists, [])` copies the growing result on every addition.
# NOTE: this rebinds ht_good / ht_bad, so re-run the extraction cell before
# running this cell a second time.
ht_good = [tag for tags in ht_good for tag in tags]
ht_bad = [tag for tags in ht_bad for tag in tags]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words features: fit a CountVectorizer with default settings on the
# cleaned text of every row in `combi` and keep the transformed matrix.
bagofwords = CountVectorizer()
bow = bagofwords.fit_transform(combi['clean'])
# Display the shape of the resulting feature matrix.
bow.shape

In [None]:
# TF-IDF features: fit a TfidfVectorizer with default settings on the
# cleaned text of every row in `combi` and keep the transformed matrix.
tidifv = TfidfVectorizer()
tidif = tidifv.fit_transform(combi['clean'])
# Display the shape of the resulting feature matrix.
tidif.shape

In [None]:
import gensim

In [None]:
# Tokenise each cleaned tweet on whitespace; these token lists are the
# Word2Vec training corpus.
ttoken = combi['clean'].apply(lambda x:x.split())
# Build the Word2Vec model: 10000-dimensional vectors, context window of 5,
# min_count=2, sg=0. The value 10000 is repeated as a literal where the
# per-tweet feature matrix is built further down, so the two must stay in sync.
# NOTE(review): passing the corpus to the constructor presumably already
# trains the model, so the explicit train() call below adds 20 more epochs
# on top of that -- confirm this is intended.
# NOTE(review): no seed is passed, so results may differ between runs.
w2v = gensim.models.Word2Vec(ttoken, vector_size = 10000, window = 5, min_count =2, sg = 0)
w2v.train(ttoken, total_examples = len(combi['clean']),epochs =20)

In [None]:
# Sanity check: list the vocabulary entries most similar to 'trump'.
w2v.wv.most_similar(positive = 'trump')

In [None]:
# Display the learned embedding vector for 'trump'.
(w2v.wv['trump'])

In [None]:
def word_vector(token, size, model=None):
    """Average the word embeddings of the words in ``token``.

    Parameters
    ----------
    token : iterable of str
        Words of one document.
    size : int
        Dimensionality of the embedding vectors.
    model : object, optional
        Any object whose ``.wv[word]`` lookup returns the vector of ``word``
        and raises ``KeyError`` for unknown words. Defaults to the
        notebook-level ``w2v`` model.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)`` holding the mean vector of the words
        found in the vocabulary, or all zeros when none is found.

    Raises
    ------
    ValueError
        If a looked-up vector cannot be reshaped to ``(1, size)``, i.e.
        ``size`` does not match the model's dimensionality. Only the
        out-of-vocabulary ``KeyError`` is skipped; a bare ``except`` would
        hide this mismatch and silently return zeros.
    """
    vectors = (w2v if model is None else model).wv
    vec = np.zeros((1, size))
    count = 0
    for word in token:
        try:
            word_vec = vectors[word]
        except KeyError:
            # Word is not in the vocabulary: it contributes nothing.
            continue
        vec += word_vec.reshape((1, size))
        count += 1
    if count != 0:
        vec /= count
    return vec

In [None]:
# Build one averaged Word2Vec feature row per tweet.
# The width is read from the trained model rather than repeated as a
# literal, so it cannot drift from the `vector_size` used at training time.
w2v_dim = w2v.wv.vector_size
wvarray = np.zeros((len(ttoken), w2v_dim))
for row, words in enumerate(ttoken):
    wvarray[row, :] = word_vector(words, w2v_dim)
wvdf = pd.DataFrame(wvarray)

In [None]:
# Display the shape of the Word2Vec feature frame.
wvdf.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [None]:
# Logistic regression on bag-of-words features.
# `combi` stacks train on top of test, so the first len(train) rows of `bow`
# are the training rows; deriving the split point from `train` avoids a
# hard-coded row count.
n_train = len(train)
train_bow = bow[:n_train, :]
test_bow = bow[n_train:, :]

# 70/30 train/validation split with a fixed seed; the tf-idf and Word2Vec
# cells below reuse the row indices of ytrain / yvalid.
(xtrain, xvalid, ytrain, yvalid) = train_test_split(
    train_bow, train['label'], test_size=0.3, random_state=1
)

lreg = LogisticRegression()
lreg.fit(xtrain, ytrain)
prediction = lreg.predict_proba(xvalid)
# Predict class 1 when its probability is at least 0.3.
# `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
prediction_int = (prediction[:, 1] >= 0.3).astype(int)



In [None]:
# Validation F1 score of the thresholded predictions.
f1_score(yvalid, prediction_int)


In [None]:
# Score the test set with the bag-of-words logistic regression and write the
# submission file.
test_pred = lreg.predict_proba(test_bow)
# Same 0.3 probability threshold as on the validation set.
# `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
test_pred_i = (test_pred[:, 1] >= 0.3).astype(int)
test['label'] = test_pred_i
submission = test[['id', 'label']]
submission.to_csv('sub_lreg_bow.csv', index=False)

In [None]:
# Logistic regression on TF-IDF features.
# The first len(train) rows of `tidif` are the training rows (train is
# stacked before test in `combi`).
n_train = len(train)
train_tidif = tidif[:n_train, :]
test_tidif = tidif[n_train:, :]

# Reuse the rows of the bag-of-words split so the scores are comparable.
xtrain_tidif = train_tidif[ytrain.index]
xvalid_tidif = train_tidif[yvalid.index]

lreg.fit(xtrain_tidif, ytrain)
prediction = lreg.predict_proba(xvalid_tidif)
# Predict class 1 when its probability is at least 0.3.
# `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
prediction_int = (prediction[:, 1] >= 0.3).astype(int)

print(f1_score(yvalid, prediction_int))

test_pred = lreg.predict_proba(test_tidif)
test_pred_i = (test_pred[:, 1] >= 0.3).astype(int)
test['label'] = test_pred_i
submission = test[['id', 'label']]
submission.to_csv('sub_lreg_tidif.csv', index=False)

In [None]:
# Inspect the row labels selected for the training split.
ytrain.index

In [None]:
# Logistic regression on averaged Word2Vec features.
# The first len(train) rows of `wvdf` are the training rows (train is
# stacked before test in `combi`).
n_train = len(train)
train_w2v = wvdf.iloc[:n_train, :]
test_w2v = wvdf.iloc[n_train:, :]

# Reuse the rows of the bag-of-words split so the scores are comparable.
xtrain_w2v = train_w2v.iloc[ytrain.index, :]
xvalid_w2v = train_w2v.iloc[yvalid.index, :]

lreg = LogisticRegression(solver='liblinear')
lreg.fit(xtrain_w2v, ytrain)
prediction = lreg.predict_proba(xvalid_w2v)
# Predict class 1 when its probability is at least 0.3.
# `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
prediction_int = (prediction[:, 1] >= 0.3).astype(int)
print(f1_score(yvalid, prediction_int))

test_pred = lreg.predict_proba(test_w2v)
test_pred_i = (test_pred[:, 1] >= 0.3).astype(int)
test['label'] = test_pred_i
submission = test[['id', 'label']]
submission.to_csv('sub_lreg_w2v.csv', index=False)

In [None]:
from sklearn import svm

In [None]:
# Linear-kernel SVM on averaged Word2Vec features.
# The first len(train) rows of `wvdf` are the training rows (train is
# stacked before test in `combi`).
n_train = len(train)
train_w2v = wvdf.iloc[:n_train, :]
test_w2v = wvdf.iloc[n_train:, :]

# Reuse the rows of the bag-of-words split so the scores are comparable.
xtrain_w2v = train_w2v.iloc[ytrain.index, :]
xvalid_w2v = train_w2v.iloc[yvalid.index, :]

# NOTE: the name `lreg` is kept so later references still resolve, but from
# here on it holds an SVC, not a logistic regression.
lreg = svm.SVC(kernel='linear', C=1, probability=True)
lreg.fit(xtrain_w2v, ytrain)
prediction = lreg.predict_proba(xvalid_w2v)
# Predict class 1 when its probability is at least 0.3.
# `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
prediction_int = (prediction[:, 1] >= 0.3).astype(int)
print(f1_score(yvalid, prediction_int))

test_pred = lreg.predict_proba(test_w2v)
test_pred_i = (test_pred[:, 1] >= 0.3).astype(int)
test['label'] = test_pred_i
submission = test[['id', 'label']]
# Written to its own file: the previous name 'sub_lreg_w2v.csv' overwrote the
# logistic-regression Word2Vec submission.
submission.to_csv('sub_svc_w2v.csv', index=False)

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted trees on the Word2Vec training split.
# `colsample_bytree` is the XGBoost name for per-tree column subsampling;
# the previous keyword `colsample` is not an XGBoost parameter, so the
# intended 0.9 column subsampling was not applied.
xgb_model = XGBClassifier(max_depth=8, n_estimators=1000, nthread=3,
                          colsample_bytree=0.9, eta=0.1, subsample=0.9,
                          min_child_weight=6, objective='binary:logistic',
                          eval_metric='logloss')
xgb_model.fit(xtrain_w2v, ytrain)
# Hard class predictions on the validation rows, scored with F1.
predict = xgb_model.predict(xvalid_w2v)
f1_score(yvalid, predict)



In [None]:
# Score the test set with the Word2Vec XGBoost model and write the submission.
test_pred = xgb_model.predict(test_w2v)
# Cast *this* model's predictions to int. The previous code re-cast the
# stale `test_pred_i` left over from an earlier model and used `np.int`,
# which was removed in NumPy 1.24.
test_pred_i = test_pred.astype(int)
test['label'] = test_pred_i
submission = test[['id', 'label']]
submission.to_csv('sub_xgb_w2v.csv', index=False)

In [None]:
# Display the Word2Vec training-split feature frame.
xtrain_w2v

In [None]:
# Refit the same XGBoost model on the bag-of-words split (xtrain / xvalid)
# and report the validation F1 score of its hard predictions.
xgb_model.fit(xtrain,ytrain)
predict = xgb_model.predict(xvalid)
f1_score(yvalid,predict)


In [None]:
# Score the test set with the bag-of-words XGBoost model and write the
# submission.
test_pred = xgb_model.predict(test_bow)
# Cast *this* model's predictions to int. The previous code re-cast the
# stale `test_pred_i` left over from an earlier cell and used `np.int`,
# which was removed in NumPy 1.24.
test_pred_i = test_pred.astype(int)
test['label'] = test_pred_i
submission = test[['id', 'label']]
submission.to_csv('sub_xgb_bow.csv', index=False)