In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD

In [7]:
# Load the preprocessed tweet dataset from the project's data directory
# and preview the first rows to check the columns that were read.
df = pd.read_csv(filepath_or_buffer='../data/Preprocessed_data.csv')
df.head()

Unnamed: 0,tweet_with_emoji_meaning,label
0,پیرمرد وصیت احدی تاکید احدی درد دل نکنید بعدش ...,0
1,مجوز بده ملت ماشین استاندارد بتونن بیارن سوار...,0
2,دیت دختره دید زشتم می‌خواست پاشه بره آینهی دست...,1
3,اکیپ دخترونه هست پایه قراراست کلاس نمیذاره زود...,0
4,۵۰۰ نفری مراسم سالگرد پدر همسرم شرکت فوتی‌ی ۴۰...,0


In [8]:
# Model input is the tweet text column; the target is the `label` column.
x, y = df['tweet_with_emoji_meaning'], df['label']

In [9]:
# Hold out 20% of the rows for evaluation. The rows are shuffled before
# splitting, and the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [10]:
# Shapes of the training and test inputs.
(x_train.shape, x_test.shape)

((11740,), (2936,))

In [11]:
# TF-IDF over unigrams and bigrams, capped at 10,000 features.
# The vectorizer is fitted on the training text only; the test text is
# transformed with that already-fitted vocabulary.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)
(x_train_tfidf.shape, x_test_tfidf.shape)


((11740, 10000), (2936, 10000))

In [26]:
# Project the TF-IDF matrix onto 200 dense components.
# The projection is fitted on the training matrix only and then applied
# to the test matrix.
# random_state pins TruncatedSVD's default randomized solver so the reduced
# features are identical across kernel restarts (same seed as the
# train/test split).
svd = TruncatedSVD(n_components=200, random_state=42)
x_train_svd = svd.fit_transform(x_train_tfidf)
x_test_svd = svd.transform(x_test_tfidf)
x_train_svd.shape, x_test_svd.shape


((11740, 200), (2936, 200))

In [31]:
# Candidate classifiers to compare, keyed by the display name printed
# when each one is evaluated. Other hyperparameters keep their
# scikit-learn defaults.
# The random forest is seeded so that its bootstrap and feature sampling,
# and therefore its reported scores, are repeatable between runs.
models = {
    'Logistic Regression': LogisticRegression(),
    'MultinomialNB': MultinomialNB(),
    'SVC': SVC(),
    'Random Forest': RandomForestClassifier(random_state=42)
}


In [32]:
def cacl_metrics(model, x_train, x_test, y_train, y_test):
    """Fit ``model`` on the training data and print its test-set metrics.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        x_train: Training features passed to ``model.fit``.
        x_test: Test features passed to ``model.predict``.
        y_train: Training labels passed to ``model.fit``.
        y_test: True labels the predictions are scored against.

    Returns:
        None. The classification report and the accuracy are printed.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    report = classification_report(y_test, predictions)
    print(report)

    accuracy = accuracy_score(y_test, predictions)
    print('Accuracy: ', accuracy)
    

In [35]:
# Train and score every candidate on the same TF-IDF features, printing a
# separator line after each report. The SVD-reduced matrices are not used
# in this comparison.
for model_name, model in models.items():
    print(f"Model: {model_name}")
    cacl_metrics(
        model,
        x_train_tfidf,
        x_test_tfidf,
        y_train,
        y_test,
    )
    print("-" * 100)


Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.76      0.80      0.78      1543
           1       0.77      0.72      0.74      1393

    accuracy                           0.76      2936
   macro avg       0.76      0.76      0.76      2936
weighted avg       0.76      0.76      0.76      2936

Accuracy:  0.7612397820163488
----------------------------------------------------------------------------------------------------
Model: MultinomialNB
              precision    recall  f1-score   support

           0       0.81      0.70      0.75      1543
           1       0.71      0.82      0.76      1393

    accuracy                           0.76      2936
   macro avg       0.76      0.76      0.76      2936
weighted avg       0.77      0.76      0.76      2936

Accuracy:  0.7588555858310627
----------------------------------------------------------------------------------------------------
Model: SVC
              precision  