In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/emotion-detection-from-text/tweet_emotions.csv


## Importing Libraries

In [2]:
import re
import nltk
from nltk.corpus import stopwords 
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.naive_bayes import MultinomialNB 
from sklearn.linear_model import LogisticRegression
from textblob import Word

In [3]:
!python -m textblob.download_corpora
nltk.download('omw-1.4')

[nltk_data] Downloading package brown to /usr/share/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /usr/share/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.


[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


True

## Loading and preprocessing data

In [4]:
train = pd.read_csv("/kaggle/input/emotion-detection-from-text/tweet_emotions.csv")

In [5]:
train = train.drop(['tweet_id'],axis=1)
train.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [6]:
train['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [7]:
train.isnull().sum()

sentiment    0
content      0
dtype: int64

In [8]:
STOPWORDS = stopwords.words("english")

In [9]:
def clean(text):
    text = text.lower()
    text = re.sub("[^\w\s]","",text) # Remove punctuations 
    text = " ".join(w for w in text.split() if w not in STOPWORDS)
    text = " ".join([Word(word).lemmatize() for word in text.split()])
    return text

In [10]:
def remove_duplicates(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

In [11]:
train['text'] = train['content'].apply(lambda x : clean(x)) 
train['text'] = train['text'].apply(lambda x :  remove_duplicates(x))

In [12]:
freq=  pd.Series(" ".join(train['text']).split()).value_counts()[-10000:] # Now remove rare words that appears in the list

In [13]:
freq =  list(freq.index)

In [14]:
train['text'] = train['text'].apply(lambda x:  " ".join(x for x in x.split() if x not in freq))

In [15]:
train.head()

Unnamed: 0,sentiment,content,text
0,empty,@tiffanylue i know i was listenin to bad habi...,tiffanylue know listenin bad habit earlier sta...
1,sadness,Layin n bed with a headache ughhhh...waitin o...,layin n bed headache ughhwaitin call
2,sadness,Funeral ceremony...gloomy friday...,funeral ceremonygloomy friday
3,enthusiasm,wants to hang out with friends SOON!,want hang friend soon
4,neutral,@dannycastillo We want to trade with someone w...,dannycastillo want trade someone houston ticke...


In [16]:
x = np.array(train['text'].values)[:20000]
y =  np.array(train['sentiment'].values)[:20000]

In [17]:
Le = LabelEncoder()
y =  Le.fit_transform(y)

In [18]:
print("x Shape : {}".format(x.shape))
print("Y shape : {}".format(y.shape))

x Shape : (20000,)
Y shape : (20000,)


In [19]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,shuffle=True,random_state=42,stratify=y)

In [20]:
x_train.shape

(16000,)

In [21]:
x_train

array(['hahaha thanks clearing', 'sigh guess gonna meet today',
       'leaving utah today super sad face', ...,
       'bummer might make sunday show hopefully sun shine cloudy day',
       'im fine bit tired im glad weekend',
       'almost tomorrow 8 start 291kms mud httpmobypicturecomzx33t1'],
      dtype=object)

In [22]:
y

array([ 2, 10, 10, ...,  8, 11, 12])

## Using TfidfVectorizer

In [23]:
tf = TfidfVectorizer(analyzer='word',max_features=1000,ngram_range=(1,3))
x_tf = tf.fit_transform(x_train)
x_val_tf = tf.transform(x_test)

In [24]:
x_tf   = x_tf.toarray()
x_val_tf =  x_val_tf.toarray()

In [25]:
x_tf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Time To Train Models

### 1) MultinomialNB

In [26]:
model =MultinomialNB()
model.fit(x_tf,y_train)

MultinomialNB()

In [27]:
model.score(x_val_tf,y_test)

0.3325

## 2) LogisticRegression

In [28]:
model =LogisticRegression(solver='liblinear',C=1)
model.fit(x_tf,y_train)

LogisticRegression(C=1, solver='liblinear')

In [29]:
model.score(x_val_tf,y_test)

0.34375

## 3) RandomForestClassifier

In [30]:
model =RandomForestClassifier()
model.fit(x_tf,y_train)

RandomForestClassifier()

In [31]:
model.score(x_val_tf,y_test)

0.3415

In [32]:
##### Thanks #######