In [1]:
############# STEP 1
############# LOADING THE DATASET
import pandas as pd
import numpy as np 
import re, nltk
from sklearn.feature_extraction.text import CountVectorizer        
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from collections import Counter

test_data_file_name='testing.txt'
train_data_file_name='training.txt'
test_data_df = pd.read_csv(test_data_file_name, header=None, delimiter=";")
test_data_df.columns = ["Text"]
train_data_df = pd.read_csv(train_data_file_name, header=None, delimiter=";")

train_data_df.columns = ["Type","Text"]
TypeText=['positive','negative','neutral']

In [2]:
train_data_df.head(2)

Unnamed: 0,Type,Text
0,0,today is the last class of business outcomes
1,0,there is a beautiful weather outside


In [3]:
test_data_df

Unnamed: 0,Text
0,i have computer and laptop
1,I don't live in Sopot
2,winter break is in 4 weeks


In [4]:
train_data_df.shape


(215, 2)

In [5]:
train_data_df['Type'].groupby(train_data_df['Type']).count()

Type
0    71
1    71
2    73
Name: Type, dtype: int64

In [6]:
############# STEP 3
np.mean([len(s.split(" ")) for s in train_data_df.Text])

stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    text = re.sub("[^a-zA-Z]", " ", text)
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    # !!! replace the line below with your stopwords list like this
    # stop_words=POLISH_STOP_WORDS,
    stop_words = 'english',
    max_features = 85
)

corpus_data_features = vectorizer.fit_transform(
    train_data_df.Text.tolist() + test_data_df.Text.tolist())
corpus_data_features_nd = corpus_data_features.toarray()
corpus_data_features_nd.shape
vocab = vectorizer.get_feature_names_out()
dist = np.sum(corpus_data_features_nd, axis=0)
X_train, X_test, y_train, y_test  = train_test_split(
        corpus_data_features_nd[0:len(train_data_df)], 
        train_data_df.Type,
        train_size=0.85, 
        random_state=1234)
log_model = LogisticRegression()
log_model = log_model.fit(X=X_train, y=y_train)
y_pred = log_model.predict(X_test)




In [7]:
#what is stemming?
print(vocab)

['bad' 'beauti' 'best' 'black' 'brand' 'break' 'busi' 'christma' 'class'
 'cold' 'color' 'comput' 'dark' 'day' 'desk' 'don' 'dure' 'earli' 'evalu'
 'experi' 'fast' 'fight' 'final' 'finish' 'good' 'grey' 'ha' 'happi'
 'hard' 'holiday' 'hp' 'idea' 'iiyama' 'improv' 'insid' 'intel' 'job'
 'laptop' 'like' 'lot' 'love' 'm' 'mani' 'monitor' 'month' 'morn' 'new'
 'nice' 'onli' 'outcom' 'outsid' 'pc' 'phone' 'plain' 'prefer' 'project'
 'realli' 'rest' 'sad' 'school' 'semest' 'ski' 'slow' 'small' 'sopot'
 'stand' 't' 'thi' 'thing' 'think' 'time' 'today' 'upcom' 'use' 'veri'
 'wa' 'wait' 'wake' 'weather' 'week' 'white' 'winter' 'work' 'year'
 'yesterday']


In [8]:
############# STEP 5
############# TESTING TRAINING DATASET
print("Testing the training dataset accuracy...")
print(classification_report(y_test, y_pred))


from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

Testing the training dataset accuracy...
              precision    recall  f1-score   support

           0       1.00      0.67      0.80        12
           1       0.71      1.00      0.83        10
           2       1.00      1.00      1.00        11

    accuracy                           0.88        33
   macro avg       0.90      0.89      0.88        33
weighted avg       0.91      0.88      0.88        33



array([[ 8,  4,  0],
       [ 0, 10,  0],
       [ 0,  0, 11]])

In [9]:
############# STEP 6
############# TESTING THE REAL DATASET
log_model = LogisticRegression()
log_model = log_model.fit(X=corpus_data_features_nd[0:len(train_data_df)], y=train_data_df.Type)
  
test_pred = log_model.predict(corpus_data_features_nd[len(train_data_df):])
    
import random
spl = random.sample(range(len(test_pred)), len(test_pred))
purpose=[]
for text, type in zip(test_data_df.Text[spl], test_pred[spl]):
    print (TypeText[type],':', text)
    purpose.append(type)
print("The following labels were identified:\n" )
c = Counter(purpose)
for letter in c:
    print ('%s: %d' % (letter, c[letter]))

neutral : i have computer and laptop
negative : I don't live in Sopot
positive : winter break is in 4 weeks
The following labels were identified:

2: 1
1: 1
0: 1
