In [None]:
import pandas as pd 
import numpy as np
import sys
import grpc
import zemberek_grpc.morphology_pb2 as z_morphology
import zemberek_grpc.morphology_pb2_grpc as z_morphology_g
import matplotlib.pyplot as plt
import re
from sklearn.metrics import classification_report
import string
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.cluster import KMeans

# Load the labelled news CSV (first column is the index, file is cp1254-encoded)
# and keep only the rows that have no missing value.
df = pd.read_csv("news_with_label.csv", index_col=0, encoding="cp1254").dropna(axis=0)

In [None]:
# Open an insecure (unencrypted) gRPC channel to localhost:6789 and build the
# morphology service stub on top of it; analyze() sends its requests through this stub.
# NOTE(review): presumably a Zemberek NLP gRPC server must already be running on that port — confirm.
channel = grpc.insecure_channel('localhost:6789')
morphology_stub = z_morphology_g.MorphologyServiceStub(channel)

In [None]:
def analyze(i):
    """Send the sentence ``i`` to the morphology service and return its response.

    ``i`` is wrapped in a ``SentenceAnalysisRequest`` and passed to
    ``AnalyzeSentence`` on the module-level ``morphology_stub``.
    """
    request = z_morphology.SentenceAnalysisRequest(input=i)
    return morphology_stub.AnalyzeSentence(request)

In [None]:
def fix_decode(text):
    """Return ``text`` decoded from cp1254 on Python 2, or unchanged on Python 3+."""
    running_py2 = sys.version_info < (3, 0)
    return text.decode('cp1254') if running_py2 else text

In [None]:
# Partition the frame by label value (0, 1, -1) and print the size of each partition.
notr = df.loc[df["label"] == 0]
pos = df.loc[df["label"] == 1]
neg = df.loc[df["label"] == -1]
print(f" Pozitif Label Len :{len(pos)} \n Negatif Label Len :{len(neg)} \n Notr Label Len    :{len(notr)}")

In [None]:
def zemberek(text):
    """Replace every token of ``text`` by its lemma and return the space-joined result.

    The sentence is sent to the morphology service through ``analyze``. For each
    analysed token the first lemma of the best analysis is used; when that lemma
    is "UNK", or the analysis carries no lemma at all, the original token is
    kept instead.

    Args:
        text: sentence to analyse.

    Returns:
        A single string with one entry per analysed token, separated by spaces.
    """
    print('Analysis result for input : ' + fix_decode(text))
    analysis_result = analyze(text)

    words = []
    for a in analysis_result.results:
        lemmas = a.best.lemmas
        # An empty lemma list would make lemmas[0] raise IndexError; treat it like "UNK".
        lemma = lemmas[0] if len(lemmas) > 0 else "UNK"
        words.append(a.token if lemma == "UNK" else lemma)
    return " ".join(words)

In [None]:
def clean_text(text, verbose=True):
    """Normalise ``text`` before it is vectorised.

    Steps, in order: lower-case the text, remove ``[...]`` spans, remove the
    characters in ``string.punctuation``, remove every word that contains a
    digit, remove newline characters.

    Args:
        text: input string.
        verbose: when True (the default, which keeps the previous behaviour)
            the cleaned text is printed before it is returned. Pass False when
            cleaning many texts in a row.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    # Raw strings: "\[", "\w" and "\d" are invalid escape sequences in a normal
    # string literal and trigger a warning on current Python versions.
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub("[%s]" % re.escape(string.punctuation), "", text)
    text = re.sub(r"\w*\d\w*", "", text)
    text = text.replace("\n", "")
    if verbose:
        print(text)
    return text

In [None]:
# Inputs are the `content` column as strings, targets the `label` column as integers.
df_x = df["content"].astype(str)
df_y = df["label"].astype("int")

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=44,
)

In [None]:
# Text -> features: count word n-grams of length 1 to 6, then re-weight the counts with TF-IDF.
pipeline = Pipeline(
    steps=[
        ("vect", CountVectorizer(analyzer="word", ngram_range=(1, 6))),
        ("tfidf", TfidfTransformer()),
    ]
)

In [None]:
# Fit the vectorising pipeline on the training texts only, then apply the
# already-fitted pipeline to the test texts.
x_train_vect=pipeline.fit_transform(x_train)
x_test_vect=pipeline.transform(x_test)

In [None]:
# Candidate estimators. Every estimator that accepts a seed gets one, so that
# repeated runs of the notebook produce the same fitted models and scores.
mnb = MultinomialNB(alpha=0.01)  # Used for multi-class problems.
knn = KNeighborsClassifier(3)
svc = LinearSVC(random_state=44)
rfc = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=44)
tree = DecisionTreeClassifier(random_state=44)
sgd = SGDClassifier(alpha=0.01, random_state=44)
# kmeans is created here but is not part of the `models` list used for model selection.
kmeans = KMeans(n_clusters=3, random_state=0)

In [None]:
# Fit every candidate on the training vectors, score it on the test vectors and
# remember the candidate with the highest accuracy in `selected`.
# models=[mnb]
models = [mnb, svc, rfc, tree, sgd, knn]
best_acc = 0
selected = models[0]
for model in models:
    try:
        model = model.fit(x_train_vect, y_train)
        accuracy = accuracy_score(y_test, model.predict(x_test_vect))
    except Exception as err:
        # Still continue with the remaining candidates, but report the failure
        # instead of dropping it silently.
        print(f"Model : {model} \n Failed : {err!r} \n")
        continue
    print(f"Model : {model} \n Accuracy : {accuracy} \n")
    if accuracy > best_acc:
        best_acc = accuracy
        selected = model
print("Best Accuracy :" + str(best_acc))
print("Selected Model : " + str(selected))

In [None]:
def make_predict(text):
    """Run ``text`` through zemberek(), clean_text() and the fitted pipeline, then predict.

    Prints the transformed feature matrix followed by the prediction of the
    ``selected`` model. Nothing is returned.
    """
    # NOTE(review): the training texts are vectorised straight from df.content,
    # without zemberek()/clean_text() — confirm the CSV is already preprocessed
    # the same way, otherwise training and prediction inputs differ.
    lemmatized = zemberek(text)
    cleaned = clean_text(lemmatized)
    features = pipeline.transform([cleaned])
    pred = selected.predict(features)
    print(features)
    print(pred)

In [None]:
# Sample input for a manual check of make_predict(); the literal includes the
# leading and trailing newline.
text="""
sat
"""

make_predict(text)

In [None]:
# Predictions of the selected model for the test vectors, and the true test
# labels converted to a NumPy array for the evaluation cells.
y_predict=selected.predict(x_test_vect)
y_actual=np.array(y_test)

In [None]:
# Confusion matrix of the selected model on the test split.
# The label order is passed explicitly so the matrix is always 3x3 and its
# rows/columns match the tick labels, even when one class does not occur in
# y_actual or y_predict (the classification report uses the same label list).
class_values = [-1, 0, 1]
class_label = [str(value) for value in class_values]
cm = confusion_matrix(y_actual, y_predict, labels=class_values)
df_cm = pd.DataFrame(cm, index=class_label, columns=class_label)

# The font size is applied through a scoped rc_context that covers the whole
# figure. Updating plt.rcParams after the heatmap was drawn changed global
# state, so the first run and every re-run of the cell rendered differently.
with plt.rc_context({'font.size': 30}):
    plt.figure(figsize=(10, 8))
    sns.heatmap(df_cm, annot=True, fmt='d')
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()

In [None]:
# from nltk.probability import FreqDist
# all_words=[]
# for text in df.content:
#     for i in text.split():
#         all_words.append(i)
        

In [None]:
# fdist=FreqDist(word for word in all_words)

In [None]:
# fdist.most_common(5)

In [None]:
# plt.figure(figsize=(24,16))
# plt.scatter(range(len(y_predict)),y_predict)
# plt.scatter(range(len(y_predict)),y_actual)



In [None]:
# Text report of the per-class metrics for the labels -1, 0 and 1 on the test split.
print(classification_report(y_actual, y_predict, labels=[-1,0,1]))