In [41]:
import warnings
warnings.filterwarnings('ignore')

In [42]:
# Corpus reader:
import os
root = './Confs_newline/Conf2/'
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
reader = CategorizedPlaintextCorpusReader(root, r'.*\.txt', cat_pattern=r'(\w+)/*', encoding='utf-8')

In [43]:
print(reader.categories())
print(reader.fileids())

['kiz', 'kork', 'mutlu', 'notr', 'uzul']
['kiz.txt', 'kork.txt', 'mutlu.txt', 'notr.txt', 'uzul.txt']


In [44]:
def sent_tokenize_whole_tweets(text): # raw text --> whole tweets file content
    sentences= []
    for line in text.split('\n'):
        line= line.strip()
        sentences.append(line)
    return sentences

In [47]:
all_text=[]
labels= []

In [48]:
for label,file_name in zip(reader.categories(), reader.fileids()):
    sentences= sent_tokenize_whole_tweets(reader.raw(file_name)) # --> this should return a list of contents
    labels.extend([label for i in sentences])
    all_text.extend([i for i in sentences])
print(len(labels))
print(len(all_text))
# Now, we have all tweets in all_text list!

3316
3316


In [49]:
all_text[3315]

'hayat ben küs ben hayat küs hatırlaneg ama bu arala fena dargın hayat'

In [13]:
# WITH UNIGRAM COUNT VECTORIZER:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import StratifiedKFold 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
# The folds are made by preserving the percentage of samples for each class.
# use the original data, all_text!

list_classifier= [MultinomialNB(), LinearSVC()]

for clf in list_classifier:
    print(clf)
    skf = StratifiedKFold(n_splits=5,shuffle=True, random_state=123)
    all_text= np.array(all_text)
    labels= np.array(labels)
    scores= []
    i= 1
    for train_index, test_index in skf.split(all_text, labels):
        X_train, y_train = all_text[train_index], labels[train_index] 
        X_test, y_test = all_text[test_index], labels[test_index]

        train_vectorizer = CountVectorizer()
        X_train = train_vectorizer.fit_transform(X_train)
        train_vocab= train_vectorizer.vocabulary_   
        test_vectorizer = CountVectorizer(vocabulary=train_vocab)
        X_test = test_vectorizer.fit_transform(X_test)
        clf.fit(X_train, y_train)
        sc= accuracy_score(y_test, clf.predict(X_test))
        scores.append(sc)
        print("fold "+ str(i)+ " accuracy: "+ str(sc))
        i+= 1
    print("Mean score: "+ str(np.mean(scores)))
    print(" ")

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
fold 1 accuracy: 0.8943089430894309
fold 2 accuracy: 0.8943089430894309
fold 3 accuracy: 0.9073170731707317
fold 4 accuracy: 0.8972267536704731
fold 5 accuracy: 0.8972267536704731
Mean score: 0.8980776933381079
 
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
fold 1 accuracy: 0.9170731707317074
fold 2 accuracy: 0.9382113821138212
fold 3 accuracy: 0.9317073170731708
fold 4 accuracy: 0.9314845024469821
fold 5 accuracy: 0.9298531810766721
Mean score: 0.9296659106884707
 


In [16]:
# WITH UNIGRAM TF VECTORIZER:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import StratifiedKFold 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
# The folds are made by preserving the percentage of samples for each class.
# use the original data, all_text!

from sklearn.feature_extraction.text import TfidfTransformer

list_classifier= [MultinomialNB(), LinearSVC()]

for clf in list_classifier:
    print(clf)
    skf = StratifiedKFold(n_splits=5,shuffle=True, random_state=123)
    all_text= np.array(all_text)
    labels= np.array(labels)
    scores= []
    i= 1
    for train_index, test_index in skf.split(all_text, labels):
        X_train, y_train = all_text[train_index], labels[train_index] 
        X_test, y_test = all_text[test_index], labels[test_index]

        train_vectorizer = CountVectorizer()
        X_train = train_vectorizer.fit_transform(X_train)
        transformer_tf= TfidfTransformer(use_idf= False, norm= 'l2') # if normalize None, it is the same as CountVect
        # l2 normalization: like percentage of values
        X_train= transformer_tf.fit_transform(X_train)
           
        train_vocab= train_vectorizer.vocabulary_   
        test_vectorizer = CountVectorizer(vocabulary=train_vocab)
        X_test = test_vectorizer.fit_transform(X_test)
        X_test= transformer_tf.fit_transform(X_test)
        
        clf.fit(X_train, y_train)
        sc= accuracy_score(y_test, clf.predict(X_test))
        scores.append(sc)
        print("fold "+ str(i)+ " accuracy: "+ str(sc))
        i+= 1
    print("Mean score: "+ str(np.mean(scores)))
    print(" ")



MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
fold 1 accuracy: 0.8617886178861789
fold 2 accuracy: 0.865040650406504
fold 3 accuracy: 0.8666666666666667
fold 4 accuracy: 0.8564437194127243
fold 5 accuracy: 0.8548123980424144
Mean score: 0.8609504104828977
 
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
fold 1 accuracy: 0.926829268292683
fold 2 accuracy: 0.9398373983739837
fold 3 accuracy: 0.9300813008130081
fold 4 accuracy: 0.9347471451876019
fold 5 accuracy: 0.9298531810766721
Mean score: 0.9322696587487899
 


In [17]:
# WITH UNIGRAM TF-idf VECTORIZER:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import StratifiedKFold 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
# The folds are made by preserving the percentage of samples for each class.
# use the original data, all_text!


from sklearn.feature_extraction.text import TfidfTransformer

list_classifier= [MultinomialNB(), LinearSVC()]

for clf in list_classifier:
    print(clf)
    skf = StratifiedKFold(n_splits=5,shuffle=True, random_state=123)
    all_text= np.array(all_text)
    labels= np.array(labels)
    scores= []
    i= 1
    for train_index, test_index in skf.split(all_text, labels):
        X_train, y_train = all_text[train_index], labels[train_index] 
        X_test, y_test = all_text[test_index], labels[test_index]

        train_vectorizer = CountVectorizer()
        X_train = train_vectorizer.fit_transform(X_train)
        transformer_tf= TfidfTransformer(use_idf= True, norm= 'l2', smooth_idf=True) 
        # if normalize None, it is the same as CountVect
        # l2 normalization: like percentage of values
        X_train= transformer_tf.fit_transform(X_train)
           
        train_vocab= train_vectorizer.vocabulary_   
        test_vectorizer = CountVectorizer(vocabulary=train_vocab)
        X_test = test_vectorizer.fit_transform(X_test)
        X_test= transformer_tf.fit_transform(X_test)
        
        clf.fit(X_train, y_train)
        sc= accuracy_score(y_test, clf.predict(X_test))
        scores.append(sc)
        print("fold "+ str(i)+ " accuracy: "+ str(sc))
        i+= 1
    print("Mean score: "+ str(np.mean(scores)))
    print(" ")



MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
fold 1 accuracy: 0.8552845528455284
fold 2 accuracy: 0.865040650406504
fold 3 accuracy: 0.8796747967479674
fold 4 accuracy: 0.8548123980424144
fold 5 accuracy: 0.8597063621533442
Mean score: 0.8629037520391517
 
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
fold 1 accuracy: 0.9252032520325203
fold 2 accuracy: 0.9414634146341463
fold 3 accuracy: 0.9317073170731708
fold 4 accuracy: 0.9363784665579119
fold 5 accuracy: 0.9249592169657422
Mean score: 0.9319423334526983
 


### Look at the following small examples to ensure that everything works as expected:

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
>>> corpus = [
...     'This is the first document.',
...     'This document is the second document.',
...     'And this is the third one.',
...     'Is this the first document?',
... ]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())
print(X.toarray().shape)  

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer_tf= TfidfTransformer(use_idf= False, norm=None)

X_tf= transformer_tf.fit_transform(X)
print(X_tf.toarray())  

In [None]:
transformer_tfidf= TfidfTransformer(use_idf= True, norm=None)

X_tfidf= transformer_tfidf.fit_transform(X)
print(X_tfidf.toarray()) 

In [None]:
import numpy as np
print((X.toarray()).shape[0])
print((X.toarray()).shape[1])
X_binary= np.zeros(((X.toarray()).shape[0],(X.toarray()).shape[1]))
print(X_binary.shape)
for i in range(len(X.toarray())):
    for j in range(len(X.toarray()[i])):
        if X.toarray()[i,j]>0:
            X_binary[i,j]= True
        else:
            X_binary[i,j]= False
print(X_binary)