In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize  
import pickle
import os

In [2]:
df=pd.read_csv("data/text_emotion.csv")

In [3]:
df.head()

Unnamed: 0,text,emotion
0,carefully word blog posts amount criticism hea...,0
1,cannot remember little mermaid feeling carefre...,1
2,not feeling super well turns cold knocked next...,1
3,feel honored part group amazing talents,1
4,think helping also began feel pretty lonely lo...,0


In [4]:
df['emotion'].value_counts()

emotion
1    134205
0    120334
Name: count, dtype: int64

In [5]:
# balanced data set

In [6]:
df.shape

(254539, 2)

In [7]:
X=df['text']
y=df['emotion']

In [8]:
#train_test split

from sklearn.model_selection import train_test_split

In [9]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=df['emotion'])

In [10]:
# Preprocessing

In [11]:
import re
def remove_special(text):
    clean_text = re.sub(r"[^a-zA-Z]", " ", text)
    return clean_text

In [12]:

def remove_html(text):
    pattern=re.compile('<.*?>')
    return pattern.sub(r'', text)

In [13]:
def remove_extra_white_spaces(text):
    pattern=r'\s+[a-zA-Z]\s+'
    without_space=re.sub(pattern=pattern,repl=" ",string=text)
    return without_space
    

In [14]:
from nltk.corpus import stopwords
stopwords=stopwords.words('english')

In [15]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [16]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sapta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:



def remove_stop(text):
    new_text=[]
    for word in text.split():
        if word in stopwords:
            new_text.append('')
        else:
            new_text.append(word)
    x=new_text[:]
    new_text.clear()
    
    return " ".join(x)

In [18]:
X_train=X_train.reset_index()
X_test=X_test.reset_index()

In [19]:
X_train=X_train.drop(columns=['index'],axis=1)
X_test=X_test.drop(columns=['index'],axis=1)

In [20]:
X_train.head()

Unnamed: 0,text
0,feelings victimized
1,sat chair various times said anything times al...
2,still challenging feel reassured description son
3,feel sorry people openly slander pass ill judg...
4,love earlier master presence could feel lot ca...


In [21]:
def clean_text(X_train):
    X_train['text']=X_train['text'].str.lower()
    X_train['text']=X_train['text'].apply(remove_special)
    X_train['text']=X_train['text'].apply(remove_html)
    X_train['text']=X_train['text'].apply(remove_extra_white_spaces)
    X_train['text']=X_train['text'].apply(remove_stop)
    X_train['text']=X_train['text'].apply(stem_words)
    

    return X_train

In [22]:
X_train=clean_text(X_train)
X_test=clean_text(X_test)

In [23]:
stem_words('nothappy')

'nothappi'

In [24]:
X_train.head()

Unnamed: 0,text
0,feel victim
1,sat chair variou time said anyth time alway wa...
2,still challeng feel reassur descript son
3,feel sorri peopl openli slander pass ill judge...
4,love earlier master presenc could feel lot cas...


In [25]:
tfidf=TfidfVectorizer(max_features=2000,ngram_range=(1,2))

In [26]:
X_train_tfidf=tfidf.fit_transform(X_train['text'])
X_test_tfidf=tfidf.transform(X_test['text'])

In [40]:
X_train_tfidf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
%pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.6-py3-none-win_amd64.whl (70.9 MB)
                                              0.0/70.9 MB ? eta -:--:--
                                              0.0/70.9 MB ? eta -:--:--
                                             0.0/70.9 MB 330.3 kB/s eta 0:03:35
                                             0.1/70.9 MB 363.1 kB/s eta 0:03:16
                                             0.1/70.9 MB 476.3 kB/s eta 0:02:29
                                             0.1/70.9 MB 514.3 kB/s eta 0:02:18
                                             0.2/70.9 MB 615.9 kB/s eta 0:01:55
                                             0.2/70.9 MB 623.6 kB/s eta 0:01:54
                                             0.3/70.9 MB 710.0 kB/s eta 0:01:40
                                             0.3/70.9 MB 703.7 kB/s eta 0:01:41
                                             0.4/70.9 MB 740.5 kB/s eta 0:01:36
                                             0.5/70.9 MB

In [29]:
%pip install catboost

^C
Note: you may need to restart the kernel to use updated packages.


In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,roc_auc_score,precision_score,log_loss,accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
#from lightgbm import LGBMClassifier

In [40]:
model=[LogisticRegression(solver='liblinear'),RandomForestClassifier(max_depth=2),MultinomialNB(),GradientBoostingClassifier(),AdaBoostClassifier()]
for m in model:
    m.fit(X_train_tfidf,y_train)
    # get importance
    y_pred=m.predict(X_test_tfidf)
    y_pred_train=m.predict(X_train_tfidf)
    print(m," ","Test: ",accuracy_score(y_test,y_pred)," ","Train: ",accuracy_score(y_train,y_pred_train))
    print("="*30,"\n")

LogisticRegression(solver='liblinear')   Test:  0.950636442209476   Train:  0.9533076987295648

RandomForestClassifier(max_depth=2)   Test:  0.5448063172782274   Train:  0.5452067710712023

MultinomialNB()   Test:  0.9401272884418952   Train:  0.9416198908810545

GradientBoostingClassifier()   Test:  0.8605916555354758   Train:  0.8640432939974758

AdaBoostClassifier()   Test:  0.7406105130824232   Train:  0.7438995044958774



In [None]:
m=CatBoostClassifier(X_train)
m.fit(X_train_tfidf,y_train)
    # get importance
y_pred=m.predict(X_test_tfidf)
y_pred_train=m.predict(X_train_tfidf)
print(m," ","Test: ",accuracy_score(y_test,y_pred)," ","Train: ",accuracy_score(y_train,y_pred_train))
print("="*30,"\n")

In [None]:
from sklearn.ensemble import StackingClassifier
lr=LogisticRegression(solver='liblinear')
mnb=MultinomialNB()
gbc=GradientBoostingClassifier()

In [None]:
estimators=[
    ('lr',lr),
    ('lgbm',lgbm),
    ('gbc',gbc),
    ('mlp',mlp),
    ('rfc',rfc)
]
​

In [41]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression(solver='liblinear')

In [42]:
model.fit(X_train_tfidf,y_train)

In [43]:
y_pred_test=model.predict(X_test_tfidf)
y_pred_train=model.predict(X_train_tfidf)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,roc_auc_score,precision_score,log_loss
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier


In [None]:
from sklearn.metrics import accuracy_score
print("TRAIN: ",accuracy_score(y_train,y_pred_train)," ","TEST: ",accuracy_score(y_test,y_pred_test))

TRAIN:  0.9533076987295648   TEST:  0.950636442209476


'artifacts\\model.pkl'

In [1]:
data='saptarsi das is good boy'

In [5]:
new_data=pd.Series(data)

In [6]:
new_data

0    saptarsi das is good boy
dtype: object

In [46]:
model = pickle.load(open('clssifier.pkl','rb'))

FileNotFoundError: [Errno 2] No such file or directory: 'clssifier.pkl'