In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locations of the labelled train and test CSV files inside the Kaggle dataset.
TRAIN_CSV = '/kaggle/input/emotion-classification-nlp/emotion-labels-train.csv'
TEST_CSV = '/kaggle/input/emotion-classification-nlp/emotion-labels-test.csv'

# Load both splits; later cells read their `text` and `label` columns.
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [None]:
# Print the column index of the train split followed by that of the test split.
print(*(frame.columns for frame in (train, test)))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Side-by-side bar charts of how often each label occurs in the two splits.
fig, (train_ax, test_ax) = plt.subplots(1, 2, figsize=(10, 5))
sns.countplot(x=train.label, ax=train_ax)
train_ax.set_xlabel("Label distribution of train data")
sns.countplot(x=test.label, ax=test_ax)
test_ax.set_xlabel("Label distribution of test data")

The distribution of labels is similar in the train and test datasets.

In [None]:
# Characters that clean_text replaces with a single space: ASCII punctuation plus
# assorted Unicode symbols (arrows, box-drawing glyphs, full-width punctuation, ...).
# NOTE(review): the list also contains accented letters such as 'é', 'è', 'à' and 'ï',
# so words containing them are split at those letters — confirm this is intended.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…',
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─',
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞',
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean_text(data):
    """Remove English stop words and punctuation from the ``text`` column.

    Each entry is coerced to ``str``, split on whitespace, filtered against the
    NLTK English stop-word list, re-joined with single spaces, and then every
    character listed in the module-level ``puncts`` is replaced by a space.

    Unlike the previous implementation, ``data`` is NOT modified: the caller's
    ``text`` column keeps its raw content, so cells that read it stay valid when
    re-run.

    Args:
        data: DataFrame (or any mapping) whose ``'text'`` entry is an iterable
            of texts.

    Returns:
        list of str: one cleaned string per entry, in the original order.
    """
    # Imported here so the function does not depend on a later cell having
    # already imported `stopwords`.
    from nltk.corpus import stopwords

    # A set gives constant-time membership tests instead of a list scan per word.
    # NOTE(review): the comparison is case-sensitive, so a stop word is only
    # dropped when its casing matches the list entry exactly.
    stop = set(stopwords.words('english'))

    res = []
    for raw in data['text']:
        # Coerce before splitting so a non-string entry cannot break `.split()`.
        text = ' '.join(word for word in str(raw).split() if word not in stop)
        for punct in puncts:
            text = text.replace(punct, ' ')
        res.append(text)
    return res

In [None]:

def word_count(df):
    """Count the whitespace-separated tokens in every ``text`` entry.

    Args:
        df: DataFrame (or mapping) whose ``'text'`` entry is an iterable of strings.

    Returns:
        list of int: token count per entry, in the original order.
    """
    return [len(entry.split()) for entry in df['text']]
# Store the per-row token counts as a new `word_count` column on each split.
for frame in (train, test):
    frame['word_count'] = word_count(frame)

In [None]:
# One sub-frame per emotion label, for each split.
train_joy, train_anger, train_fear, train_sadness = (
    train[train['label'] == emotion] for emotion in ('joy', 'anger', 'fear', 'sadness')
)
test_joy, test_anger, test_fear, test_sadness = (
    test[test['label'] == emotion] for emotion in ('joy', 'anger', 'fear', 'sadness')
)

In [None]:
# Histogram of words-per-row for each emotion in the training split.
# A histogram is drawn because the x-label promises a distribution; a line plot of
# the counts against the row index (the previous behaviour) does not show one.
plt.figure(figsize=(20,5))
train_subsets = [
    ('joy', train_joy),
    ('anger', train_anger),
    ('fear', train_fear),
    ('sadness', train_sadness),
]
for position, (emotion, subset) in enumerate(train_subsets, start=1):
    plt.subplot(1, 4, position)
    plt.hist(subset.word_count, bins=20)
    plt.xlabel(f"Word distribution of train data for class {emotion}")
    plt.ylabel("Number of rows")
plt.tight_layout()
plt.show()

In [None]:
# Histogram of words-per-row for each emotion in the test split.
# A histogram is drawn because the x-label promises a distribution; a line plot of
# the counts against the row index (the previous behaviour) does not show one.
plt.figure(figsize=(20,5))
test_subsets = [
    ('joy', test_joy),
    ('anger', test_anger),
    ('fear', test_fear),
    ('sadness', test_sadness),
]
for position, (emotion, subset) in enumerate(test_subsets, start=1):
    plt.subplot(1, 4, position)
    plt.hist(subset.word_count, bins=20)
    plt.xlabel(f"Word distribution of test data for class {emotion}")
    plt.ylabel("Number of rows")
plt.tight_layout()
plt.show()

In [None]:
from sklearn import preprocessing
# Fit the encoder on the training labels only, then map both splits to integer class ids.
le = preprocessing.LabelEncoder().fit(train['label'])
y_train = le.transform(train['label'])
y_test = le.transform(test['label'])

In [None]:
# Single-feature design matrices of shape (n_rows, 1) built from the word counts.
X = train['word_count'].to_numpy()[:, np.newaxis]
X_test = test['word_count'].to_numpy()[:, np.newaxis]

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Baseline: a shallow decision tree trained on the word-count feature alone.
clf = DecisionTreeClassifier(max_depth=4, splitter='best', random_state=5).fit(X, y_train)
y_preds = clf.predict(X_test)
acc_score = accuracy_score(y_true=y_test, y_pred=y_preds)
print(acc_score)

In [None]:
from sklearn.linear_model import LogisticRegression
# Second baseline on the same word-count feature: regularised logistic regression.
lr = LogisticRegression(C=0.3).fit(X, y_train)
y_preds = lr.predict(X_test)
acc_lr_score = accuracy_score(y_true=y_test, y_pred=y_preds)
print(acc_lr_score)

**Using count vectorizer to vectorize the text variable**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: the vocabulary is learned from the training texts only.
vect = CountVectorizer()
X_vec = vect.fit_transform(train['text'])
X_test_vec = vect.transform(test['text'])

# Refit the decision tree `clf` from the earlier cell on the bag-of-words features.
y_preds = clf.fit(X_vec, y_train).predict(X_test_vec)
acc_score = accuracy_score(y_true=y_test, y_pred=y_preds)
print(acc_score)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest on the bag-of-words features.
# The seed is fixed (same value as the decision tree above) so the reported accuracy
# is reproducible across kernel restarts; an unseeded forest gives a different score
# on every run.
rf = RandomForestClassifier(random_state=5)
rf.fit(X_vec, y_train)
y_preds = rf.predict(X_test_vec)
acc_score = accuracy_score(y_test, y_preds)
print(acc_score)

In [None]:
!pip install xgboost

In [None]:
from xgboost import XGBClassifier
# Gradient-boosted trees with default settings on the bag-of-words features.
model = XGBClassifier().fit(X_vec, y_train)
y_preds = model.predict(X_test_vec)
acc_score = accuracy_score(y_true=y_test, y_pred=y_preds)
print(acc_score)

**Try TF-IDF features, then sentence embeddings (Universal Sentence Encoder)**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF unigram features, capped at 3500 terms, with English stop words removed.
vect_tf = TfidfVectorizer(max_features=3500, stop_words='english', ngram_range=(1,1))
X_vec_tf = vect_tf.fit_transform(train['text'])
X_test_vec_tf = vect_tf.transform(test['text'])

# Refit the XGBoost `model` from the earlier cell on the TF-IDF features.
y_preds = model.fit(X_vec_tf, y_train).predict(X_test_vec_tf)
acc_score = accuracy_score(y_true=y_test, y_pred=y_preds)
print(acc_score)

In [None]:
# Refit the random forest `rf` from the earlier cell on the TF-IDF features.
y_preds = rf.fit(X_vec_tf, y_train).predict(X_test_vec_tf)
acc_score = accuracy_score(y_true=y_test, y_pred=y_preds)
print(acc_score)

In [None]:
import tensorflow_hub as hub
import lightgbm as lgb

def generate_embeddings(X_train, X_test, y_train, y_test, batch_size=256):
    """Embed texts with Universal Sentence Encoder and print classifier accuracies.

    Both splits are embedded with the TF-Hub Universal Sentence Encoder (v4);
    an XGBoost classifier and a LightGBM classifier are then trained on the
    training embeddings and their accuracy on the test embeddings is printed.

    Args:
        X_train: iterable of training texts.
        X_test: iterable of test texts.
        y_train: encoded training labels.
        y_test: encoded test labels.
        batch_size: number of texts sent to the encoder per call (default 256).

    Returns:
        None. The two accuracy values are printed.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Load the encoder once and share it between both splits instead of
    # loading it again on every embed_document call.
    encoder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

    def embed_document(data):
        """Return a DataFrame with one embedding row per text in ``data``."""
        texts = list(data)
        # Encode in batches rather than one text per call.
        chunks = [
            np.asarray(encoder(texts[start:start + batch_size]))
            for start in range(0, len(texts), batch_size)
        ]
        return pd.DataFrame(np.vstack(chunks))

    # vectorize the data (the embedding columns are unnamed, i.e. 0..dim-1)
    X_train_vec = embed_document(X_train)
    X_test_vec = embed_document(X_test)

    xgb_model = XGBClassifier(n_estimators=1000,learning_rate=0.001,max_depth=5,n_jobs=8)
    xgb_model.fit(X_train_vec, y_train)
    # The former `.score(...)` calls were dropped: their results were discarded
    # and each one repeated the prediction pass.
    ypred = xgb_model.predict(X_test_vec)
    print('XGBoost scores')
    accuracy = accuracy_score(y_test, ypred)
    print(accuracy)
    print("-------------------------------------------------------------------")
    print('LightGBM scores:')
    lgb_model = lgb.LGBMClassifier()
    lgb_model.fit(X_train_vec, y_train)
    ypred = lgb_model.predict(X_test_vec)
    accuracy = accuracy_score(y_test, ypred)
    print(accuracy)
    

In [None]:
# Use dedicated names for the raw text columns: binding them to `X_test` would
# overwrite the word-count matrix of that name built in an earlier cell and break
# re-running the cells that predict from it.
train_texts = train['text']
test_texts = test['text']
generate_embeddings(train_texts, test_texts, y_train, y_test)

In [None]:
# Characters that clean_text replaces with a single space: ASCII punctuation plus
# assorted Unicode symbols (arrows, box-drawing glyphs, full-width punctuation, ...).
# NOTE(review): this appears to repeat the `puncts` list from an earlier cell;
# this later assignment is the one in effect when clean_text is finally called.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…',
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─',
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞',
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean_text(data):
    """Remove English stop words and punctuation from the ``text`` column.

    Each entry is coerced to ``str``, split on whitespace, filtered against the
    NLTK English stop-word list, re-joined with single spaces, and then every
    character listed in the module-level ``puncts`` is replaced by a space.

    Unlike the previous implementation, ``data`` is NOT modified: the caller's
    ``text`` column keeps its raw content, so cells that read it stay valid when
    re-run.

    Args:
        data: DataFrame (or any mapping) whose ``'text'`` entry is an iterable
            of texts.

    Returns:
        list of str: one cleaned string per entry, in the original order.
    """
    # Imported here so the function does not depend on another cell having
    # already imported `stopwords`.
    from nltk.corpus import stopwords

    # A set gives constant-time membership tests instead of a list scan per word.
    # NOTE(review): the comparison is case-sensitive, so a stop word is only
    # dropped when its casing matches the list entry exactly.
    stop = set(stopwords.words('english'))

    res = []
    for raw in data['text']:
        # Coerce before splitting so a non-string entry cannot break `.split()`.
        text = ' '.join(word for word in str(raw).split() if word not in stop)
        for punct in puncts:
            text = text.replace(punct, ' ')
        res.append(text)
    return res

In [None]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Clean both splits, rebuild the bag-of-words vocabulary on the cleaned training
# texts, and train a fresh XGBoost model. `vect` and `model` are rebound here and
# are the objects the later prediction cell uses.
processed_train_text = clean_text(train)
processed_test_text = clean_text(test)

vect = CountVectorizer()
X_vec = vect.fit_transform(processed_train_text)
X_test_vec = vect.transform(processed_test_text)

model = XGBClassifier().fit(X_vec, y_train)
y_preds = model.predict(X_test_vec)
acc_score = accuracy_score(y_true=y_test, y_pred=y_preds)
print(acc_score)

In [None]:
!pip install gradio
import gradio as gr


In [None]:
def greet(input_tweet):
    """Predict the encoded emotion class for a single tweet.

    The tweet goes through the same steps as the training data of ``model``:
    it is cleaned with ``clean_text`` and vectorised with the fitted ``vect``.
    The resulting sparse row is passed to ``model.predict`` directly; wrapping
    it in a one-cell DataFrame (the previous behaviour) hands the model a 1x1
    object array instead of a feature matrix.

    Args:
        input_tweet: raw tweet text entered by the user (``None`` is treated as
            an empty string).

    Returns:
        int: the predicted class id, in the integer encoding produced by the
        label encoder ``le`` (``le.inverse_transform`` maps it back to a name).
    """
    cleaned = clean_text(pd.DataFrame({'text': [str(input_tweet or '')]}))
    features = vect.transform(cleaned)
    y_predict = model.predict(features)
    # Convert the NumPy scalar to a plain int for the numeric output component.
    return int(y_predict[0])
# Text box in, predicted class id out.
# `gr.Textbox` is used because the legacy `gr.inputs` namespace is no longer
# available in current Gradio releases, which is what an unpinned install pulls in.
iface = gr.Interface(
  fn = greet,
  inputs=gr.Textbox(lines=5, placeholder="Enter your tweet here..."),
  outputs="number")
# NOTE(review): share=True requests a publicly reachable link — confirm that
# exposing the demo outside this session is intended.
iface.launch(share=True)

In [None]:
# Look up the feature names stored on the trained model's underlying booster
# and display them as the cell output.
x = model.get_booster().feature_names
x