# text classification

## gerekli kütüphaneleri çağırma işlemi

In [1]:
import os
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2 , f_classif 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report 


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ALAAEDDIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## veri okuma ve bir sınıfa atma işlemleri

In [2]:
path_1 = "C:/Users/ALAAEDDIN/PycharmProjects/pythonProject2/raw_texts/1"
path_2 = "C:/Users/ALAAEDDIN/PycharmProjects/pythonProject2/raw_texts/2"
path_3 = "C:/Users/ALAAEDDIN/PycharmProjects/pythonProject2/raw_texts/3"

def read_txt_files(path, class_name):
    tweets_list = []
    for f_name in os.listdir(path):
        if f_name.endswith(".txt"):
            with open(os.path.join(path, f_name), "r", encoding="ISO-8859-9") as f:
                tweet = f.read()
                tweets_list.append([tweet, class_name])
                tweets_df = pd.DataFrame(tweets_list, columns=["Tweet", "Class"])
    return tweets_df

df_class1 = read_txt_files(path_1, "positive")
df_class2 = read_txt_files(path_2, "negative")
df_class3 = read_txt_files(path_3, "neutral")
df = pd.concat([df_class1, df_class2, df_class3])

In [3]:
df.head()

Unnamed: 0,Tweet,Class
0,dun Turkcelle tepkilerimizden sonra bugün Turk...,positive
1,girmezmiyim.. Turkcell kartim bile var.. Yarin...,positive
2,tam tünelden gecerken 3g cekiyordu:D türkcell'...,positive
3,turkcell superonline fiber internet veya ADSL ...,positive
4,bence Gnçtrkcll Ark Winterfest 2012'de 1.olur ...,positive


In [4]:
df.tail()

Unnamed: 0,Tweet,Class
952,Evet kesinlikle çok az.. Turkcell bu konuda şu...,neutral
953,selocan nedir abicim ya ?,neutral
954,özturkcell e dönücz diye korkuyorum ama çanakk...,neutral
955,turkcell se 532 yi ara puk kodunu ogrenme menu...,neutral
956,CHIP Galaxy Y ve Chat Türkiye'de!: Samsung'un ...,neutral


In [5]:
df = df.reset_index(drop = True)

In [6]:
df.tail()

Unnamed: 0,Tweet,Class
2995,Evet kesinlikle çok az.. Turkcell bu konuda şu...,neutral
2996,selocan nedir abicim ya ?,neutral
2997,özturkcell e dönücz diye korkuyorum ama çanakk...,neutral
2998,turkcell se 532 yi ara puk kodunu ogrenme menu...,neutral
2999,CHIP Galaxy Y ve Chat Türkiye'de!: Samsung'un ...,neutral


In [7]:
df.shape

(3000, 2)

In [8]:
df["Class"].value_counts()

negative    1287
neutral      957
positive     756
Name: Class, dtype: int64

## verileri temizleme işlemi

büyük harften küçüğe çevirmek

In [9]:
df["result"] = df["Tweet"].str.lower()
display(df)

Unnamed: 0,Tweet,Class,result
0,dun Turkcelle tepkilerimizden sonra bugün Turk...,positive,dun turkcelle tepkilerimizden sonra bugün turk...
1,girmezmiyim.. Turkcell kartim bile var.. Yarin...,positive,girmezmiyim.. turkcell kartim bile var.. yarin...
2,tam tünelden gecerken 3g cekiyordu:D türkcell'...,positive,tam tünelden gecerken 3g cekiyordu:d türkcell'...
3,turkcell superonline fiber internet veya ADSL ...,positive,turkcell superonline fiber internet veya adsl ...
4,bence Gnçtrkcll Ark Winterfest 2012'de 1.olur ...,positive,bence gnçtrkcll ark winterfest 2012'de 1.olur ...
...,...,...,...
2995,Evet kesinlikle çok az.. Turkcell bu konuda şu...,neutral,evet kesinlikle çok az.. turkcell bu konuda şu...
2996,selocan nedir abicim ya ?,neutral,selocan nedir abicim ya ?
2997,özturkcell e dönücz diye korkuyorum ama çanakk...,neutral,özturkcell e dönücz diye korkuyorum ama çanakk...
2998,turkcell se 532 yi ara puk kodunu ogrenme menu...,neutral,turkcell se 532 yi ara puk kodunu ogrenme menu...


kelimenin sonunda 2 den fazla tekrarlana herf silme işlemi

In [10]:
def delete_repet_char(st):
    return re.sub(r"(.)\1\1+",r"\1\1", st)
df["result"] = df["result"].apply(lambda x: delete_repet_char(x))

In [11]:
df.head()

Unnamed: 0,Tweet,Class,result
0,dun Turkcelle tepkilerimizden sonra bugün Turk...,positive,dun turkcelle tepkilerimizden sonra bugün turk...
1,girmezmiyim.. Turkcell kartim bile var.. Yarin...,positive,girmezmiyim.. turkcell kartim bile var.. yarin...
2,tam tünelden gecerken 3g cekiyordu:D türkcell'...,positive,tam tünelden gecerken 3g cekiyordu:d türkcell'...
3,turkcell superonline fiber internet veya ADSL ...,positive,turkcell superonline fiber internet veya adsl ...
4,bence Gnçtrkcll Ark Winterfest 2012'de 1.olur ...,positive,bence gnçtrkcll ark winterfest 2012'de 1.olur ...


In [12]:
# veriler hasthag veya menthion veya linkler içermediği için bunlara temizlemek gerekmiyor ama olursa temizlememiz daha doğru olur

In [13]:
# harf veya rakam olmayan her şey sileriz şu şekildedir

In [14]:
import string

def delete_punctuation(st):
    punctuations = string.punctuation
    no_punct = ""
    for char in st:
        if char not in punctuations:
            no_punct += char

    return no_punct

df["result"] = df["result"].apply(lambda x: delete_punctuation(x))

In [15]:
df.tail()

Unnamed: 0,Tweet,Class,result
2995,Evet kesinlikle çok az.. Turkcell bu konuda şu...,neutral,evet kesinlikle çok az turkcell bu konuda şu a...
2996,selocan nedir abicim ya ?,neutral,selocan nedir abicim ya
2997,özturkcell e dönücz diye korkuyorum ama çanakk...,neutral,özturkcell e dönücz diye korkuyorum ama çanakk...
2998,turkcell se 532 yi ara puk kodunu ogrenme menu...,neutral,turkcell se 532 yi ara puk kodunu ogrenme menu...
2999,CHIP Galaxy Y ve Chat Türkiye'de!: Samsung'un ...,neutral,chip galaxy y ve chat türkiyede samsungun komp...


In [16]:
df.head()

Unnamed: 0,Tweet,Class,result
0,dun Turkcelle tepkilerimizden sonra bugün Turk...,positive,dun turkcelle tepkilerimizden sonra bugün turk...
1,girmezmiyim.. Turkcell kartim bile var.. Yarin...,positive,girmezmiyim turkcell kartim bile var yarindan ...
2,tam tünelden gecerken 3g cekiyordu:D türkcell'...,positive,tam tünelden gecerken 3g cekiyordud türkcellin...
3,turkcell superonline fiber internet veya ADSL ...,positive,turkcell superonline fiber internet veya adsl ...
4,bence Gnçtrkcll Ark Winterfest 2012'de 1.olur ...,positive,bence gnçtrkcll ark winterfest 2012de 1olur da...


## tokenization

In [17]:
df["result"] = df["result"].apply(lambda x :word_tokenize(x))

In [18]:
df.head()

Unnamed: 0,Tweet,Class,result
0,dun Turkcelle tepkilerimizden sonra bugün Turk...,positive,"[dun, turkcelle, tepkilerimizden, sonra, bugün..."
1,girmezmiyim.. Turkcell kartim bile var.. Yarin...,positive,"[girmezmiyim, turkcell, kartim, bile, var, yar..."
2,tam tünelden gecerken 3g cekiyordu:D türkcell'...,positive,"[tam, tünelden, gecerken, 3g, cekiyordud, türk..."
3,turkcell superonline fiber internet veya ADSL ...,positive,"[turkcell, superonline, fiber, internet, veya,..."
4,bence Gnçtrkcll Ark Winterfest 2012'de 1.olur ...,positive,"[bence, gnçtrkcll, ark, winterfest, 2012de, 1o..."


## STOP WORD SİLMEK

In [19]:
def read_txt_file_to_list(path, name):
    list = []
    with open(os.path.join(path, name), "r", encoding="utf-8") as f:
        for i in f:
            list.append(i.strip())
    return list
stop_word_1 = read_txt_file_to_list("C:/Users/ALAAEDDIN/PycharmProjects/pythonProject2/raw_texts", "stop-words_turkish_1_tr.txt")
stop_word_1 = stop_word_1[3:]
df["result"] = df["result"].apply(lambda x: [item for item in x if item not in stop_word_1])


In [20]:
df.head()

Unnamed: 0,Tweet,Class,result
0,dun Turkcelle tepkilerimizden sonra bugün Turk...,positive,"[dun, turkcelle, tepkilerimizden, sonra, bugün..."
1,girmezmiyim.. Turkcell kartim bile var.. Yarin...,positive,"[girmezmiyim, turkcell, kartim, yarindan, taki..."
2,tam tünelden gecerken 3g cekiyordu:D türkcell'...,positive,"[tam, tünelden, gecerken, 3g, cekiyordud, türk..."
3,turkcell superonline fiber internet veya ADSL ...,positive,"[turkcell, superonline, fiber, internet, adsl,..."
4,bence Gnçtrkcll Ark Winterfest 2012'de 1.olur ...,positive,"[bence, gnçtrkcll, ark, winterfest, 2012de, 1o..."


## TD-IDF İŞLEMLERİ

In [21]:
X = df["result"]
y = df["Class"]

VecModel = TfidfVectorizer()
X_Vec = VecModel.fit_transform(X.apply(lambda x: " ".join(x)).values)
X_Vec = pd.DataFrame.sparse.from_spmatrix(X_Vec)

print(f'The new shape for X is {X_Vec.shape}')
X_Vec.head()


The new shape for X is (3000, 12237)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12227,12228,12229,12230,12231,12232,12233,12234,12235,12236
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## elde ettiğimiz yeni tabloyu bir csv olarak kayıdetme ve sınıflara binary encoding yapmak

In [22]:
tablo = X_Vec
tablo["Class"] = y
encoder = LabelEncoder()
tablo["Class"] = tablo["Class"].apply(lambda x : 0 if x == "positive" else 1 if x=="negative" else 2)
tablo.tail()
# tablo.to_csv("C:/Users/ALAAEDDIN/PycharmProjects/pythonProject2/raw_texts/son_tablo.csv", index=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12228,12229,12230,12231,12232,12233,12234,12235,12236,Class
2995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


## Feature seçmek / azaltmak (chi kare yöntemi kullandım başka da olabilir ve feature 40% aldım 

In [23]:
y = tablo["Class"]
FeatureSelection = SelectPercentile(score_func = chi2, percentile=40 )
X = FeatureSelection.fit_transform(X_Vec, y)



## train data and test data ayırma 20% 80% olarak

In [24]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# Model

## Model kurma işlemi (K-NN algoritmasını kullanarak)

In [26]:
model = KNeighborsClassifier()

In [27]:
model.fit(x_train, y_train)

## 10-fold cross valdation uygulama

In [28]:
results_train = cross_validate(model, x_train, y_train, cv=10, scoring=["accuracy"])

In [29]:

results_train["test_accuracy"].mean()

1.0

In [30]:
results_test = cross_validate(model, x_test, y_test, cv=10, scoring=["accuracy"])

In [31]:
results_test["test_accuracy"].mean()

1.0

In [32]:
y_pred = model.predict(x_test)

## değerlendirme sonuç raporu yazdırma ve txt olarak kayıdetme

In [33]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       166
           1       1.00      1.00      1.00       256
           2       1.00      1.00      1.00       178

    accuracy                           1.00       600
   macro avg       1.00      1.00      1.00       600
weighted avg       1.00      1.00      1.00       600



In [34]:
report = classification_report(y_test, y_pred, digits=4)

with open("report.txt", mode="w", encoding="utf-8") as file:
    file.write(report)