# Import Libraries

In [1]:
import pandas as pd
import numpy as np

from textblob import TextBlob
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers


from warnings import filterwarnings
filterwarnings('ignore')

2022-11-14 11:51:17.708820: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Text Preprocessing

In [2]:
## Recap 01
data = pd.read_csv('film_comments.tsv', sep='\t')

# Change 0,1,3,4 to negatif and pozitif
data.Sentiment.replace(0, 'negative', inplace=True)
data.Sentiment.replace(1, 'negative', inplace=True)

data.Sentiment.replace(3, 'pozitive', inplace=True)
data.Sentiment.replace(4, 'pozitive', inplace=True)

# Select only Negatif and Pozitif, remove 2 Sentiments
data = data[(data.Sentiment == 'negative') | (data.Sentiment == 'pozitive')]

# Df DataFrame
df = pd.DataFrame()
df['text'] = data.Phrase
df['label'] = data.Sentiment

# LOWER 
df['text'] = df['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))

# Punctuation Marks
df['text'] = df['text'].str.replace('[^\w\s]','')

# Digits
df['text'] = df['text'].str.replace('\d','')

# Stopwords
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
sw = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

# Low Frequency Word
delete = pd.Series(' '.join(df['text']).split()).value_counts()[-1000:]
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in delete))

#lemmi
from textblob import Word
#nltk.download('wordnet')
df['text'] = df['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()])) 

df.head()

Unnamed: 0,text,label
0,series demonstrating adage good goose also goo...,negative
21,good goose,pozitive
22,good,pozitive
33,gander occasionally amuses none amount much story,negative
46,amuses,pozitive


# Test - Train Split

In [3]:
# Test-Train Split
train_x, test_x, train_y, test_y = model_selection.train_test_split(df["text"],
                                                                   df["label"], 
                                                                    random_state = 1)

encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

# Count Vectors

In [4]:
# Count Vectors
vectorizer = CountVectorizer()
vectorizer.fit(train_x)

x_train_count = vectorizer.transform(train_x)
x_test_count = vectorizer.transform(test_x)

# TF - IDF

## World Level

In [5]:
tf_idf_word_vectorizer = TfidfVectorizer()
tf_idf_word_vectorizer.fit(train_x)

x_train_tf_idf_word = tf_idf_word_vectorizer.transform(train_x)
x_test_tf_idf_word = tf_idf_word_vectorizer.transform(test_x)

## N-Gram Level tf-idf

In [6]:
tf_idf_ngram_vectorizer = TfidfVectorizer(ngram_range = (2,3))
tf_idf_ngram_vectorizer.fit(train_x)

x_train_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(train_x)
x_test_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(test_x)

## Character Level tf-idf

In [7]:
tf_idf_chars_vectorizer = TfidfVectorizer(analyzer = "char", ngram_range = (2,3))
tf_idf_chars_vectorizer.fit(train_x)

x_train_tf_idf_chars = tf_idf_chars_vectorizer.transform(train_x)
x_test_tf_idf_chars = tf_idf_chars_vectorizer.transform(test_x)

# Machine Learning Models 

## Logistic Regression

In [8]:
loj = linear_model.LogisticRegression()
loj_model_count = loj.fit(x_train_count, train_y)
accuracy_loj_count = model_selection.cross_val_score(loj_model_count, 
                                           x_test_count, 
                                           test_y, 
                                           cv = 10).mean()

print("Count Vectors Accuracy Ratio:", accuracy_loj_count)

Count Vectors Accuracy Ratio: 0.8368200836820083


In [9]:
loj = linear_model.LogisticRegression()
loj_model_word = loj.fit(x_train_tf_idf_word,train_y)
accuracy_loj_word = model_selection.cross_val_score(loj_model_word, 
                                           x_test_tf_idf_word, 
                                           test_y, 
                                           cv = 10).mean()

print("Word-Level TF-IDF Accuracy Ratio:", accuracy_loj_word)

Word-Level TF-IDF Accuracy Ratio: 0.8331589958158995


In [10]:
loj = linear_model.LogisticRegression()
loj_model_ngram = loj.fit(x_train_tf_idf_ngram,train_y)
accuracy_loj_ngram = model_selection.cross_val_score(loj_model_ngram, 
                                           x_test_tf_idf_ngram, 
                                           test_y, 
                                           cv = 10).mean()

print("N-GRAM TF-IDF Accuracy Ratio:", accuracy_loj_ngram)

N-GRAM TF-IDF Accuracy Ratio: 0.748326359832636


In [11]:
loj = linear_model.LogisticRegression()
loj_model_char = loj.fit(x_train_tf_idf_chars,train_y)
accuracy_loj_char = model_selection.cross_val_score(loj_model_char, 
                                           x_test_tf_idf_chars, 
                                           test_y, 
                                           cv = 10).mean()

print("CHARLEVEL Accuracy Ratio:", accuracy_loj_char)

CHARLEVEL Accuracy Ratio: 0.7811715481171548


## Naive Bayes

In [12]:
nb = naive_bayes.MultinomialNB()
nb_model_count = nb.fit(x_train_count,train_y)
accuracy_nb_count = model_selection.cross_val_score(nb_model_count, 
                                           x_test_count, 
                                           test_y, 
                                           cv = 10).mean()

print("Count Vectors Accuracy Ratio:", accuracy_nb_count)

Count Vectors Accuracy Ratio: 0.8332112970711296


In [13]:
nb = naive_bayes.MultinomialNB()
nb_model_word = nb.fit(x_train_tf_idf_word,train_y)
accuracy_nb_word = model_selection.cross_val_score(nb_model_word, 
                                           x_test_tf_idf_word, 
                                           test_y, 
                                           cv = 10).mean()

print("Word-Level TF-IDF Accuracy Ratio:", accuracy_nb_word)

Word-Level TF-IDF Accuracy Ratio: 0.835041841004184


In [14]:
nb = naive_bayes.MultinomialNB()
nb_model_ngram = nb.fit(x_train_tf_idf_ngram,train_y)
accuracy_nb_ngram = model_selection.cross_val_score(nb_model_ngram, 
                                           x_test_tf_idf_ngram, 
                                           test_y, 
                                           cv = 10).mean()

print("N-GRAM TF-IDF Accuracy Ratio:", accuracy_nb_ngram)

N-GRAM TF-IDF Accuracy Ratio: 0.7685146443514643


In [15]:
nb = naive_bayes.MultinomialNB()
nb_model_char = nb.fit(x_train_tf_idf_chars,train_y)
accuracy_nb_char = model_selection.cross_val_score(nb_model_char, 
                                           x_test_tf_idf_chars, 
                                           test_y, 
                                           cv = 10).mean()

print("CHARLEVEL Accuracy Ratio:", accuracy_nb_char)

CHARLEVEL Accuracy Ratio: 0.7557008368200837


## Random Forests

In [16]:
#rf = ensemble.RandomForestClassifier()
#rf_model = rf.fit(x_train_count,train_y)
#accuracy = model_selection.cross_val_score(rf_model, 
#                                           x_test_count, 
#                                           test_y, 
#                                           cv = 10).mean()

#print("Count Vectors Doğruluk Oranı:", accuracy)

In [17]:
#rf = ensemble.RandomForestClassifier()
#rf_model = rf.fit(x_train_tf_idf_word,train_y)
#accuracy = model_selection.cross_val_score(rf_model, 
#                                           x_test_tf_idf_word, 
#                                           test_y, 
#                                           cv = 10).mean()

#print("Word-Level TF-IDF Accuracy Ratio:", accuracy)

In [18]:
#rf = ensemble.RandomForestClassifier()
#rf_model = loj.fit(x_train_tf_idf_ngram,train_y)
#accuracy = model_selection.cross_val_score(rf_model, 
#                                           x_test_tf_idf_ngram, 
#                                           test_y, 
#                                           cv = 10).mean()

#print("N-GRAM TF-IDF Accuracy Ratio:", accuracy)

In [19]:
#rf = ensemble.RandomForestClassifier()
#rf_model = loj.fit(x_train_tf_idf_chars,train_y)
#accuracy = model_selection.cross_val_score(rf_model, 
#                                           x_test_tf_idf_chars, 
#                                           test_y, 
#                                           cv = 10).mean()

#print("CHARLEVEL Accuracy Ratio:", accuracy)

## XGBoost

In [20]:
xgb = xgboost.XGBClassifier()
xgb_model_count = xgb.fit(x_train_count,train_y)
accuracy_xgb_count = model_selection.cross_val_score(xgb_model_count, 
                                           x_test_count, 
                                           test_y, 
                                           cv = 10).mean()

print("Count Vectors Accuracy Ratio:", accuracy_xgb_count)

Count Vectors Accuracy Ratio: 0.7153242677824267


In [21]:
xgb = xgboost.XGBClassifier()
xgb_model_word = xgb.fit(x_train_tf_idf_word,train_y)
accuracy_xgb_word = model_selection.cross_val_score(xgb_model_word, 
                                           x_test_tf_idf_word, 
                                           test_y, 
                                           cv = 10).mean()

print("Word-Level TF-IDF Accuracy Ratio:", accuracy_xgb_word)

Word-Level TF-IDF Accuracy Ratio: 0.7080020920502091


In [22]:
xgb = xgboost.XGBClassifier()
xgb_model_ngram = xgb.fit(x_train_tf_idf_ngram,train_y)
accuracy_xgb_ngram = model_selection.cross_val_score(xgb_model_ngram, 
                                           x_test_tf_idf_ngram, 
                                           test_y, 
                                           cv = 10).mean()

print("N-GRAM TF-IDF Accuracy Ratio:", accuracy_xgb_ngram)

N-GRAM TF-IDF Accuracy Ratio: 0.5827928870292888


In [23]:
xgb = xgboost.XGBClassifier()
xgb_model_char = xgb.fit(x_train_tf_idf_chars,train_y)
accuracy_xgb_char = model_selection.cross_val_score(xgb_model_char, 
                                           x_test_tf_idf_chars, 
                                           test_y, 
                                           cv = 10).mean()

print("CHARLEVEL Accuracy Ratio:", accuracy_xgb_char)

CHARLEVEL Accuracy Ratio: 0.7783472803347281


## Compare ML Models

In [24]:
d = {'Type' : ['count', 'word', 'n_gram', 'character'],
     'Logistic_Reg_Acc' : [accuracy_loj_count, accuracy_loj_word, accuracy_loj_ngram, accuracy_loj_char],
     'NavieBayes_Acc' : [accuracy_nb_count, accuracy_loj_word, accuracy_nb_ngram, accuracy_nb_char],
     'XGB_Acc' : [accuracy_xgb_count, accuracy_xgb_word, accuracy_xgb_ngram, accuracy_xgb_char]}

df_compare = pd.DataFrame(d )
df_compare

Unnamed: 0,Type,Logistic_Reg_Acc,NavieBayes_Acc,XGB_Acc
0,count,0.83682,0.833211,0.715324
1,word,0.833159,0.833159,0.708002
2,n_gram,0.748326,0.768515,0.582793
3,character,0.781172,0.755701,0.778347


# Prediction 

In [25]:
new_comment01 = 'yes i like this film'

# With Count Vector and Lojistic Reg.
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_count, train_y)

v = CountVectorizer()
v.fit(train_x)

new_comment01 = pd.Series(new_comment01)
new_comment01 = v.transform(new_comment01)
loj_model_count.predict(new_comment01)

array([1])

In [26]:
# With Count Vector and Lojistic Reg.
new_comment01 = 'yes i like this film'
new_comment02 = 'this film is very nice and good i like it'
new_comment03 = 'no not good look at that shit very bad'
new_comment04 = 'not a good idea'

L = [new_comment01, new_comment02, new_comment03, new_comment04]
results = list(filter(lambda x: print(loj_model_count.predict(v.transform(pd.Series(x)))), L))

[1]
[1]
[0]
[1]


In [27]:
# With Ngram Vector and Lojistic Reg.
new_comment01 = pd.Series('yes i like this film')
new_comment02 = pd.Series('this film is very nice and good i like it') 
new_comment03 = pd.Series('no not good look at that shit very bad')
new_comment04 = pd.Series('not a good idea')

L = [new_comment01, new_comment02, new_comment03, new_comment04]

v = TfidfVectorizer(ngram_range = (2,3))
v.fit(train_x)

results = list(filter(lambda x: print(loj_model_ngram.predict(v.transform(x))), L))

[1]
[1]
[1]
[1]


In [28]:
# With Word Vector and Lojistic Reg.
new_comment01 = pd.Series('yes i like this film')
new_comment02 = pd.Series('this film is very nice and good i like it') 
new_comment03 = pd.Series('no not good look at that shit very bad')
new_comment04 = pd.Series('not good')

L = [new_comment01, new_comment02, new_comment03, new_comment04]

v = TfidfVectorizer()
v.fit(train_x)

results = list(filter(lambda x: print(loj_model_word.predict(v.transform(x))), L))

[1]
[1]
[0]
[1]


## Ready Sentiment Analysis on https://huggingface.co/models

## English

In [None]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

data = ["I love you", "I hate you"]
data2 = ['yes i like this film',
         'this film is very nice and good i like it',
         'no not good look at that shit very bad',
         'not good']

sentiment_pipeline(data2)

## German (positive - negative)

In [None]:
from germansentiment import SentimentModel

model = SentimentModel()

texts = [
    "Mit keinem guten Ergebniss","Das ist gar nicht mal so gut",
    "Total awesome!","nicht so schlecht wie erwartet",
    "Der Test verlief positiv.","Sie fährt ein grünes Auto."]
       
result = model.predict_sentiment(texts)
print(result)