In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Corpus reader:
import os
root = './Confs_newline/Conf2/'
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
reader = CategorizedPlaintextCorpusReader(root, r'.*\.txt', cat_pattern=r'(\w+)/*', encoding='utf-8')

In [3]:
print(reader.categories())
print(reader.fileids())

['kiz', 'kork', 'mutlu', 'notr', 'uzul']
['kiz.txt', 'kork.txt', 'mutlu.txt', 'notr.txt', 'uzul.txt']


In [4]:
def sent_tokenize_whole_tweets(text): # raw text --> whole tweets file content
    sentences= []
    for line in text.split('\n'):
        line= line.strip()
        sentences.append(line)
    return sentences

In [5]:
all_text=[]
labels= []

In [6]:
for label,file_name in zip(reader.categories(), reader.fileids()):
    sentences= sent_tokenize_whole_tweets(reader.raw(file_name)) # --> this should return a list of contents
    labels.extend([label for i in sentences])
    all_text.extend([i for i in sentences])
print(len(labels))
print(len(all_text))
# Now, we have all tweets in all_text list!

3316
3316


In [7]:
all_text[3315]

'hayat ben küs ben hayat küs hatırlaneg ama bu arala fena dargın hayat'

In [8]:
# WITH UNIGRAM COUNT VECTORIZER:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import StratifiedKFold 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
# The folds are made by preserving the percentage of samples for each class.
# use the original data, all_text!

list_classifier= [MultinomialNB(), LinearSVC()]

for clf in list_classifier:
    print(clf)
    skf = StratifiedKFold(n_splits=5,shuffle=True, random_state=123)
    all_text= np.array(all_text)
    labels= np.array(labels)
    scores= []
    i= 1
    for train_index, test_index in skf.split(all_text, labels):
        X_train, y_train = all_text[train_index], labels[train_index] 
        X_test, y_test = all_text[test_index], labels[test_index]

        train_vectorizer = CountVectorizer()
        X_train = train_vectorizer.fit_transform(X_train)
        train_vocab= train_vectorizer.vocabulary_   
        test_vectorizer = CountVectorizer(vocabulary=train_vocab)
        X_test = test_vectorizer.fit_transform(X_test)
        clf.fit(X_train, y_train)
        sc= accuracy_score(y_test, clf.predict(X_test))
        scores.append(sc)
        print("fold "+ str(i)+ " accuracy: "+ str(sc))
        i+= 1
    print("Mean score: "+ str(np.mean(scores)))
    print(" ")

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
fold 1 accuracy: 0.8345864661654135
fold 2 accuracy: 0.8463855421686747
fold 3 accuracy: 0.8491704374057315
fold 4 accuracy: 0.8235294117647058
fold 5 accuracy: 0.8350983358547656
Mean score: 0.8377540386718583
 
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
fold 1 accuracy: 0.8661654135338346
fold 2 accuracy: 0.8780120481927711
fold 3 accuracy: 0.8778280542986425
fold 4 accuracy: 0.8778280542986425
fold 5 accuracy: 0.8774583963691377
Mean score: 0.8754583933386056
 


In [9]:
# WITH UNIGRAM TF VECTORIZER:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import StratifiedKFold 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
# The folds are made by preserving the percentage of samples for each class.
# use the original data, all_text!

from sklearn.feature_extraction.text import TfidfTransformer

list_classifier= [MultinomialNB(), LinearSVC()]

for clf in list_classifier:
    print(clf)
    skf = StratifiedKFold(n_splits=5,shuffle=True, random_state=123)
    all_text= np.array(all_text)
    labels= np.array(labels)
    scores= []
    i= 1
    for train_index, test_index in skf.split(all_text, labels):
        X_train, y_train = all_text[train_index], labels[train_index] 
        X_test, y_test = all_text[test_index], labels[test_index]

        train_vectorizer = CountVectorizer()
        X_train = train_vectorizer.fit_transform(X_train)
        transformer_tf= TfidfTransformer(use_idf= False, norm= 'l2') # if normalize None, it is the same as CountVect
        # l2 normalization: like percentage of values
        X_train= transformer_tf.fit_transform(X_train)
           
        train_vocab= train_vectorizer.vocabulary_   
        test_vectorizer = CountVectorizer(vocabulary=train_vocab)
        X_test = test_vectorizer.fit_transform(X_test)
        X_test= transformer_tf.fit_transform(X_test)
        
        clf.fit(X_train, y_train)
        sc= accuracy_score(y_test, clf.predict(X_test))
        scores.append(sc)
        print("fold "+ str(i)+ " accuracy: "+ str(sc))
        i+= 1
    print("Mean score: "+ str(np.mean(scores)))
    print(" ")



MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
fold 1 accuracy: 0.8
fold 2 accuracy: 0.7966867469879518
fold 3 accuracy: 0.8054298642533937
fold 4 accuracy: 0.77526395173454
fold 5 accuracy: 0.81089258698941
Mean score: 0.7976546299930591
 
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
fold 1 accuracy: 0.8857142857142857
fold 2 accuracy: 0.8855421686746988
fold 3 accuracy: 0.8868778280542986
fold 4 accuracy: 0.8883861236802413
fold 5 accuracy: 0.8835098335854765
Mean score: 0.8860060479418002
 


In [10]:
# WITH UNIGRAM TF-idf VECTORIZER:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import StratifiedKFold 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
# The folds are made by preserving the percentage of samples for each class.
# use the original data, all_text!


from sklearn.feature_extraction.text import TfidfTransformer

list_classifier= [MultinomialNB(), LinearSVC()]

for clf in list_classifier:
    print(clf)
    skf = StratifiedKFold(n_splits=5,shuffle=True, random_state=123)
    all_text= np.array(all_text)
    labels= np.array(labels)
    scores= []
    i= 1
    for train_index, test_index in skf.split(all_text, labels):
        X_train, y_train = all_text[train_index], labels[train_index] 
        X_test, y_test = all_text[test_index], labels[test_index]

        train_vectorizer = CountVectorizer()
        X_train = train_vectorizer.fit_transform(X_train)
        transformer_tf= TfidfTransformer(use_idf= True, norm= 'l2', smooth_idf=True) 
        # if normalize None, it is the same as CountVect
        # l2 normalization: like percentage of values
        X_train= transformer_tf.fit_transform(X_train)
           
        train_vocab= train_vectorizer.vocabulary_   
        test_vectorizer = CountVectorizer(vocabulary=train_vocab)
        X_test = test_vectorizer.fit_transform(X_test)
        X_test= transformer_tf.fit_transform(X_test)
        
        clf.fit(X_train, y_train)
        sc= accuracy_score(y_test, clf.predict(X_test))
        scores.append(sc)
        print("fold "+ str(i)+ " accuracy: "+ str(sc))
        i+= 1
    print("Mean score: "+ str(np.mean(scores)))
    print(" ")



MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
fold 1 accuracy: 0.8
fold 2 accuracy: 0.8012048192771084
fold 3 accuracy: 0.8069381598793364
fold 4 accuracy: 0.7782805429864253
fold 5 accuracy: 0.8093797276853253
Mean score: 0.799160649965639
 
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
fold 1 accuracy: 0.8872180451127819
fold 2 accuracy: 0.8855421686746988
fold 3 accuracy: 0.8838612368024132
fold 4 accuracy: 0.8838612368024132
fold 5 accuracy: 0.8789712556732224
Mean score: 0.8838907886131059
 
