### Sentiment Analizi ve Sınıflandırma Modelleri

In [16]:
#pip install keras

In [115]:
from textblob import TextBlob
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras import layers, models, optimizers


In [2]:
import pandas as pd
data = pd.read_csv("train.tsv", sep = "\t")

In [3]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [4]:
#   verileri yeniden etkilecegiz     binary  classification  etiketleme seklinde 
data['Sentiment'].value_counts()

Sentiment
2    79582
3    32927
1    27273
4     9206
0     7072
Name: count, dtype: int64

In [5]:
data['Sentiment'].replace(0, value="negatif", inplace=True)
data['Sentiment'].replace(1, value="negatif", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Sentiment'].replace(0, value="negatif", inplace=True)


In [6]:
data['Sentiment'].replace(3, value="pozitif", inplace=True)
data['Sentiment'].replace(4, value="pozitif", inplace=True)

In [7]:
data.tail(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
156050,156051,8544,the sadness and obsession,2
156051,156052,8544,sadness and obsession,negatif
156052,156053,8544,sadness and,negatif
156053,156054,8544,beneath Hearst 's forced avuncular chortles,2
156054,156055,8544,Hearst 's forced avuncular chortles,2
156055,156056,8544,Hearst 's,2
156056,156057,8544,forced avuncular chortles,negatif
156057,156058,8544,avuncular chortles,pozitif
156058,156059,8544,avuncular,2
156059,156060,8544,chortles,2


In [8]:
data['Sentiment'].value_counts()

Sentiment
2          79582
pozitif    42133
negatif    34345
Name: count, dtype: int64

In [9]:
data = data[(data.Sentiment == "negatif")| (data.Sentiment == "pozitif")]

In [10]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,negatif
21,22,1,good for the goose,pozitif
22,23,1,good,pozitif
33,34,1,"the gander , some of which occasionally amuses...",negatif
46,47,1,amuses,pozitif


In [11]:
data.groupby("Sentiment").count()

Unnamed: 0_level_0,PhraseId,SentenceId,Phrase
Sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negatif,34345,34345,34345
pozitif,42133,42133,42133


In [12]:
df = pd.DataFrame() 
df["text"] = data["Phrase"]
df["label"] = data["Sentiment"]

In [13]:
df.head()

Unnamed: 0,text,label
0,A series of escapades demonstrating the adage ...,negatif
21,good for the goose,pozitif
22,good,pozitif
33,"the gander , some of which occasionally amuses...",negatif
46,amuses,pozitif


#### Metin on isleme

In [15]:
# buyuk-kucuk donusumu 
df['text'] = df['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))

# noktalama isaretleri 
df['text'] = df['text'].str.replace(r"[^\w\s]", "", regex = True)

# sayilar
df["text"] = df["text"].str.replace(r"\d", "", regex = True)

#stopwords
import nltk
from nltk.corpus import stopwords
sw = stopwords.words('english')
df["text"] = df["text"].apply(lambda x: " ".join(x for x in x.split() if x not in sw ))

# seyreklerin silinmesi 
delete = pd.Series(' '.join(df['text']).split()).value_counts()[-1000:]
df['text'] = df['text'].apply(lambda x: " ".join( x for x in x.split() if x not in delete))

# lemmi 
from textblob import Word
df['text'] = df['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))


#### Değişken Mühendisliği  (Feature Engineering)

* Count Vectors
* TF-IDF Vectors(words, characters, n-grams)
* Word Embeddings

TF(t) = (Bir t teriminin bir dökümanda gözlenme frekansı) / (dökümandaki toplam terim sayısı)

IDF(t) = log_e(Toplam döküman sayısı / içinde t terimi olan belge sayısı)


In [17]:
df.head()

Unnamed: 0,text,label
0,series demonstrating adage good goose also goo...,negatif
21,good goose,pozitif
22,good,pozitif
33,gander occasionally amuses none amount much story,negatif
46,amuses,pozitif


In [18]:
df.iloc[0]

text     series demonstrating adage good goose also goo...
label                                              negatif
Name: 0, dtype: object

#### Test-Train

In [20]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(df['text'],
                                                                   df['label'],
                                                                   random_state=1)

In [21]:
train_x.head()

118788    present fascinating glimpse urban life class w...
89514     hey everybody wanna watch movie guy dressed ch...
86857                               incredible number story
140626                         ultimately disappoint action
153243                                              unified
Name: text, dtype: object

In [22]:
encoder = preprocessing.LabelEncoder()

In [23]:
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

In [24]:
train_y[0:5]

array([1, 0, 1, 0, 1])

In [25]:
test_y[0:5]

array([1, 0, 1, 0, 0])

#### Count Vectors

In [27]:
vectorizer = CountVectorizer()
vectorizer.fit(train_x)

In [28]:
x_train_count = vectorizer.transform(train_x)
x_test_count = vectorizer.transform(test_x)

In [29]:
#x_train_count.head()

In [30]:
vectorizer.get_feature_names_out()[0:5]

array(['aaa', 'aaliyah', 'abagnale', 'abandon', 'abandoned'], dtype=object)

In [31]:
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

#### TF-IDF

##### word level

In [35]:
tf_idf_word_vectorizer = TfidfVectorizer()
tf_idf_word_vectorizer.fit(train_x)

In [36]:
x_train_tf_idf_word = tf_idf_word_vectorizer.transform(train_x)
x_test_tf_idf_word = tf_idf_word_vectorizer.transform(test_x)

In [37]:
tf_idf_word_vectorizer.get_feature_names_out()[0:5]

array(['aaa', 'aaliyah', 'abagnale', 'abandon', 'abandoned'], dtype=object)

In [38]:
x_train_tf_idf_word.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

##### n-gram level

In [40]:
tf_idf_ngram_vectorizer = TfidfVectorizer(ngram_range = (2,3))
tf_idf_ngram_vectorizer.fit(train_x)

In [41]:
x_train_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(train_x)
x_test_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(test_x)

##### characters level

In [43]:
tf_idf_chars_vectorizer = TfidfVectorizer(analyzer= "char", ngram_range=(2,3))
tf_idf_chars_vectorizer.fit(train_x)

In [44]:
x_train_tf_idf_chars = tf_idf_chars_vectorizer.transform(train_x)
x_test_tf_idf_chars = tf_idf_chars_vectorizer.transform(test_x)

### Makine Öğrenmesi İle Sentiment Sınıflandırması 

#### Lojistik Regresyon

In [175]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(loj_model,
                                          x_test_count,
                                          test_y,
                                          cv=10).mean()
print("Count Vectors Değruluk Oranı:", accuracy)

Count Vectors Değruluk Oranı: 0.8369769874476989


In [48]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(loj_model,
                                          x_test_tf_idf_word,
                                          test_y,
                                          cv=10).mean()
print("Word-Level TF-IDF Değruluk Oranı:", accuracy)

Word-Level TF-IDF Değruluk Oranı: 0.8332112970711298


In [49]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_ngram, train_y)
accuracy = model_selection.cross_val_score(loj_model,
                                          x_test_tf_idf_ngram,
                                          test_y,
                                          cv=10).mean()
print("N-GRAM TF-IDF Değruluk Oranı:", accuracy)

N-GRAM TF-IDF Değruluk Oranı: 0.746652719665272


In [50]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_chars, train_y)
accuracy = model_selection.cross_val_score(loj_model,
                                          x_test_tf_idf_chars,
                                          test_y,
                                          cv=10).mean()
print("CHARLEVEL Değruluk Oranı:", accuracy)

CHARLEVEL Değruluk Oranı: 0.781171548117155


#### Naive Bayes

In [52]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(nb_model,
                                          x_test_count,
                                          test_y,
                                          cv=10).mean()
print("Count Vectors Değruluk Oranı:", accuracy)

Count Vectors Değruluk Oranı: 0.8332635983263599


In [53]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(nb_model,
                                          x_test_tf_idf_word,
                                          test_y,
                                          cv=10).mean()
print("Word-Level TF-IDF Değruluk Oranı:", accuracy)

Word-Level TF-IDF Değruluk Oranı: 0.8350941422594141


In [54]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_ngram, train_y)
accuracy = model_selection.cross_val_score(nb_model,
                                          x_test_tf_idf_ngram,
                                          test_y,
                                          cv=10).mean()
print("N-GRAM TF-IDF Değruluk Oranı:", accuracy)

N-GRAM TF-IDF Değruluk Oranı: 0.7685146443514643


In [55]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_chars, train_y)
accuracy = model_selection.cross_val_score(nb_model,
                                          x_test_tf_idf_chars,
                                          test_y,
                                          cv=10).mean()
print("CHARLEVEL Değruluk Oranı:", accuracy)

CHARLEVEL Değruluk Oranı: 0.7564853556485356


#### Random Forests

In [57]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(rf_model,
                                          x_test_count,
                                          test_y,
                                          cv=10).mean()
print("Count Vectors Değruluk Oranı:", accuracy)

Count Vectors Değruluk Oranı: 0.8218096234309623


In [98]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(rf_model,
                                          x_test_tf_idf_word,
                                          test_y,
                                          cv=10).mean()
print("Word-Level TF-IDF Değruluk Oranı:", accuracy)

Word-Level TF-IDF Değruluk Oranı: 0.824581589958159


In [102]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_tf_idf_ngram,train_y)
accuracy = model_selection.cross_val_score(rf_model,
                                          x_test_tf_idf_ngram,
                                          test_y,
                                          cv=10).mean()
print("N-GRAM TF-IDF Değruluk Oranı:", accuracy)

N-GRAM TF-IDF Değruluk Oranı: 0.7549163179916317


In [103]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(rf_model,
                                          x_test_tf_idf_chars,
                                          test_y,
                                          cv=10).mean()
print("CHARLEVEL Değruluk Oranı:", accuracy)

CHARLEVEL Değruluk Oranı: 0.8139644351464435


#### XGBoost

In [179]:
from sklearn.model_selection import cross_val_score

xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_count, train_y)
accuracy = cross_val_score(xgb_model, x_test_count, test_y, cv=10).mean()
print(f"Count Vectors Değruluk Oranı: {accuracy}")


Count Vectors Değruluk Oranı: 0.7145397489539749


In [131]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_tf_idf_word,train_y)
accuracy = cross_val_score(xgb_model, x_test_tf_idf_word, test_y, cv=10).mean()
print(f"Word-Level TF-IDF Değruluk Oranı: {accuracy}")

Word-Level TF-IDF Değruluk Oranı: 0.706276150627615


In [132]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_tf_idf_ngram,train_y)
accuracy = cross_val_score(xgb_model, x_test_tf_idf_ngram, test_y, cv=10).mean()
print(f"N-GRAM TF-IDF Değruluk Oranı {accuracy}")

N-GRAM TF-IDF Değruluk Oranı 0.582479079497908


In [133]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_tf_idf_chars,train_y)
accuracy = cross_val_score(xgb_model, x_test_tf_idf_chars, test_y, cv=10).mean()
print(f" CHARLEVEL Değruluk Oranı: {accuracy}")

 CHARLEVEL Değruluk Oranı: 0.7765167364016736


In [185]:
new_comment = ["this film is very nice and good i like it"]
new_comment = ["no not good look at that shit very bad"]

In [187]:
vectorizer = CountVectorizer()
vectorizer  = vectorizer.fit(train_x)
new_comment = vectorizer.transform(new_comment)


In [189]:
loj_model.predict(new_comment)

array([0])