In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Corpus reader:
import os
root = './Confs/Conf2/'
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
reader = CategorizedPlaintextCorpusReader(root, r'.*\.txt', cat_pattern=r'(\w+)/*', encoding='utf-8')

In [3]:
print(reader.categories())
print(reader.fileids())

['kiz', 'kork', 'mutlu', 'uzul']
['kiz.txt', 'kork.txt', 'mutlu.txt', 'uzul.txt']


In [4]:
def sent_tokenize_whole_tweets(text): # raw text --> whole tweets file content
    sents = text.split('|') 
    for s in sents:
        s= s.strip() # removes whitespaces in both end
    return sents

In [5]:
all_text=[]
labels= []
for label,file_name in zip(reader.categories(), reader.fileids()):
    sentences= sent_tokenize_whole_tweets(reader.raw(file_name)) # --> this should return a list of contents
    labels.extend([label for i in sentences])
    all_text.extend([i for i in sentences])
print(len(labels))
print(len(all_text))
# Now, we have all tweets in all_text list!

3071
3071


In [6]:
all_text[10]

' evet çok sev hem de gül bir durum görneg sine çok kız '

In [None]:
# WITH UNIGRAM COUNT VECTORIZER:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import StratifiedKFold 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
# The folds are made by preserving the percentage of samples for each class.
# use the original data, all_text!

list_classifier= [MultinomialNB(), LinearSVC()]

for clf in list_classifier:
    print(clf)
    skf = StratifiedKFold(n_splits=5,shuffle=True, random_state=123)
    all_text= np.array(all_text)
    labels= np.array(labels)
    scores= []
    i= 1
    for train_index, test_index in skf.split(all_text, labels):
        X_train, y_train = all_text[train_index], labels[train_index] 
        X_test, y_test = all_text[test_index], labels[test_index]

        train_vectorizer = CountVectorizer()
        X_train = train_vectorizer.fit_transform(X_train)
        train_vocab= train_vectorizer.vocabulary_   
        test_vectorizer = CountVectorizer(vocabulary=train_vocab)
        X_test = test_vectorizer.fit_transform(X_test)
        clf.fit(X_train, y_train)
        sc= accuracy_score(y_test, clf.predict(X_test))
        scores.append(sc)
        print("fold "+ str(i)+ " accuracy: "+ str(sc))
        i+= 1
    print("Mean score: "+ str(np.mean(scores)))
    print(" ")

In [None]:
# WITH UNIGRAM TF VECTORIZER:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import StratifiedKFold 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
# The folds are made by preserving the percentage of samples for each class.
# use the original data, all_text!


from sklearn.feature_extraction.text import TfidfTransformer

list_classifier= [MultinomialNB(), LinearSVC()]

for clf in list_classifier:
    print(clf)
    skf = StratifiedKFold(n_splits=5,shuffle=True, random_state=123)
    all_text= np.array(all_text)
    labels= np.array(labels)
    scores= []
    i= 1
    for train_index, test_index in skf.split(all_text, labels):
        X_train, y_train = all_text[train_index], labels[train_index] 
        X_test, y_test = all_text[test_index], labels[test_index]

        train_vectorizer = CountVectorizer()
        X_train = train_vectorizer.fit_transform(X_train)
        transformer_tf= TfidfTransformer(use_idf= False, norm= 'l2') # if normalize None, it is the same as CountVect
        # l2 normalization: like percentage of values
        X_train= transformer_tf.fit_transform(X_train)
           
        train_vocab= train_vectorizer.vocabulary_   
        test_vectorizer = CountVectorizer(vocabulary=train_vocab)
        X_test = test_vectorizer.fit_transform(X_test)
        X_test= transformer_tf.fit_transform(X_test)
        
        clf.fit(X_train, y_train)
        sc= accuracy_score(y_test, clf.predict(X_test))
        scores.append(sc)
        print("fold "+ str(i)+ " accuracy: "+ str(sc))
        i+= 1
    print("Mean score: "+ str(np.mean(scores)))
    print(" ")



In [None]:
# WITH UNIGRAM TF-idf VECTORIZER:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import StratifiedKFold 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
# The folds are made by preserving the percentage of samples for each class.
# use the original data, all_text!


from sklearn.feature_extraction.text import TfidfTransformer

list_classifier= [MultinomialNB(), LinearSVC()]

for clf in list_classifier:
    print(clf)
    skf = StratifiedKFold(n_splits=5,shuffle=True, random_state=123)
    all_text= np.array(all_text)
    labels= np.array(labels)
    scores= []
    i= 1
    for train_index, test_index in skf.split(all_text, labels):
        X_train, y_train = all_text[train_index], labels[train_index] 
        X_test, y_test = all_text[test_index], labels[test_index]

        train_vectorizer = CountVectorizer()
        X_train = train_vectorizer.fit_transform(X_train)
        transformer_tf= TfidfTransformer(use_idf= True, norm= 'l2', smooth_idf=True) 
        # if normalize None, it is the same as CountVect
        # l2 normalization: like percentage of values
        X_train= transformer_tf.fit_transform(X_train)
           
        train_vocab= train_vectorizer.vocabulary_   
        test_vectorizer = CountVectorizer(vocabulary=train_vocab)
        X_test = test_vectorizer.fit_transform(X_test)
        X_test= transformer_tf.fit_transform(X_test)
        
        clf.fit(X_train, y_train)
        sc= accuracy_score(y_test, clf.predict(X_test))
        scores.append(sc)
        print("fold "+ str(i)+ " accuracy: "+ str(sc))
        i+= 1
    print("Mean score: "+ str(np.mean(scores)))
    print(" ")



### Look at the following small examples to ensure that everything works as expected:

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
>>> corpus = [
...     'This is the first document.',
...     'This document is the second document.',
...     'And this is the third one.',
...     'Is this the first document?',
... ]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())
print(X.toarray().shape)  

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer_tf= TfidfTransformer(use_idf= False, norm=None)

X_tf= transformer_tf.fit_transform(X)
print(X_tf.toarray())  

In [None]:
transformer_tfidf= TfidfTransformer(use_idf= True, norm=None)

X_tfidf= transformer_tfidf.fit_transform(X)
print(X_tfidf.toarray()) 

In [None]:
import numpy as np
print((X.toarray()).shape[0])
print((X.toarray()).shape[1])
X_binary= np.zeros(((X.toarray()).shape[0],(X.toarray()).shape[1]))
print(X_binary.shape)
for i in range(len(X.toarray())):
    for j in range(len(X.toarray()[i])):
        if X.toarray()[i,j]>0:
            X_binary[i,j]= True
        else:
            X_binary[i,j]= False
print(X_binary)