In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [9]:
# Load the CSV, naming its two columns `category` and `text`. Because
# `names` is passed, pandas treats the first line of the file as data rather
# than as a header row.
# os.path.join keeps the relative path portable: a backslash path literal
# only resolves on Windows.
df = pd.read_csv(os.path.join('Data', 'all-data.csv'), encoding='ISO-8859-1', names=['category', 'text'])

In [13]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,category,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


### 1. Preprocessing

In [12]:
# Alias the NLTK corpus reader so the `stopwords` set built below does not
# overwrite it; remove_stopwords looks up the module-level name `stopwords`.
from nltk.corpus import stopwords as nltk_stopwords
# A set gives constant-time membership checks for the per-word lookups.
stopwords = set(nltk_stopwords.words('english'))
 
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Args:
        text: The string to clean.

    Returns:
        A new string holding the characters of ``text`` that are not in
        ``string.punctuation``, in their original order.
    """
    import string
    # A single translate() pass replaces the per-character loop and the
    # repeated string concatenation it needed.
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_stopwords(text, stop_words=None):
    """Drop stop words from a whitespace-separated string.

    Matching is exact and case-sensitive, so callers should lower-case
    ``text`` first when the stop-word collection is lower-case.

    Args:
        text: The string to filter; it is split on whitespace.
        stop_words: Optional container of words to drop. When omitted, the
            module-level ``stopwords`` collection is used.

    Returns:
        The remaining words joined by single spaces, with no trailing space.
    """
    if stop_words is None:
        # Resolved at call time so the function follows whatever the
        # notebook currently has bound to the global name.
        stop_words = stopwords
    return " ".join(word for word in text.split() if word not in stop_words)

def lower_case(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def tokenize(text):
    """Split ``text`` on runs of whitespace and return the list of pieces."""
    tokens = text.split()
    return tokens

def lemmatize(text):
    """Lemmatize each item of ``text`` with NLTK's WordNetLemmatizer.

    ``text`` is iterated item by item, so it is expected to be a sequence of
    tokens (for example the output of ``tokenize``). A new list holding the
    lemmatized items, in the same order, is returned.
    """
    from nltk.stem import WordNetLemmatizer
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token) for token in text]

In [14]:
# Clean the raw text in three passes: strip punctuation, lower-case, then
# drop stop words.
df['text'] = (
    df['text']
    .apply(remove_punctuation)
    .apply(lower_case)
    .apply(remove_stopwords)
)

In [16]:
# Split each cleaned string into tokens, then lemmatize token by token.
# After this cell df['text'] holds lists, so the cell cannot be re-run
# without reloading df: tokenize calls .split(), which lists do not have.
df['text'] = df['text'].apply(tokenize).apply(lemmatize)

In [22]:
class TF_ICF:
    """Term-frequency / inverse-class-frequency tables for a labelled corpus.

    All tables are filled by ``build``, which ``__init__`` calls:

    * ``tf``     - ``{class: {word: count}}``: occurrences of each word in the
      documents labelled with that class.
    * ``cf``     - ``{word: n}``: number of distinct class labels whose
      documents contain the word.
    * ``icf``    - ``{word: log(len(classes) / cf[word])}``.
    * ``tf_icf`` - ``{class: {word: tf[class][word] * icf[word]}}``.
    """

    def __init__(self, classes, ls_text, ls_classes):
        """Store the corpus and compute every table.

        Args:
            classes: Collection of class labels to build term frequencies for.
            ls_text: Documents, each an iterable of tokens.
            ls_classes: Class label of each document, aligned with ``ls_text``
                by position.

        Raises:
            ValueError: If ``ls_text`` and ``ls_classes`` differ in length.
        """
        if len(ls_text) != len(ls_classes):
            raise ValueError(
                "ls_text and ls_classes must have the same length "
                f"(got {len(ls_text)} and {len(ls_classes)})"
            )
        self.classes = classes
        self.ls_text = ls_text
        self.ls_classes = ls_classes
        self.tf = {}
        self.cf = {}
        self.icf = {}
        self.tf_icf = {}
        self.build()

    def build(self):
        """Compute ``tf``, ``cf``, ``icf`` and ``tf_icf``; safe to call again."""
        for class_ in self.classes:
            # Reset first so a rebuild does not double the counts.
            self.tf[class_] = {}
            self.build_tf(class_)
        self.build_cf()
        self.build_icf()
        self.build_tf_icf()

    def build_tf(self, class_):
        """Add word counts from the documents labelled ``class_``.

        Counts accumulate into ``self.tf[class_]``, which is created when it
        does not exist yet.
        """
        counts = self.tf.setdefault(class_, {})
        # zip pairs documents with labels by position. Integer indexing
        # (`series[i]`) is label-based on a pandas Series and breaks when the
        # index is not 0..n-1.
        for text, label in zip(self.ls_text, self.ls_classes):
            if label == class_:
                for word in text:
                    counts[word] = counts.get(word, 0) + 1

    def build_cf(self):
        """Count, for every word, how many distinct class labels contain it."""
        labels_by_word = {}
        for text, label in zip(self.ls_text, self.ls_classes):
            for word in text:
                labels_by_word.setdefault(word, set()).add(label)
        # Collect the sets in a local dict and store only their sizes, so the
        # method can be re-run (self.cf never holds a mix of sets and ints).
        self.cf = {word: len(labels) for word, labels in labels_by_word.items()}

    def build_icf(self):
        """Compute ``log(len(classes) / cf[word])`` for every word in ``cf``."""
        n_classes = len(self.classes)
        self.icf = {word: np.log(n_classes / count) for word, count in self.cf.items()}

    def build_tf_icf(self):
        """Weight each class's term frequencies by the word's ICF.

        Requires ``tf`` and ``icf`` to be populated (``build`` does this).
        """
        self.tf_icf = {
            class_: {word: count * self.icf[word] for word, count in counts.items()}
            for class_, counts in self.tf.items()
        }

In [23]:
# Build the TF-ICF tables: the distinct category labels are the classes, and
# each row's text is paired with that row's category.
tf_icf = TF_ICF(set(df['category']), df['text'], df['category'])

In [28]:
# Inspect the per-word inverse class frequency, log(n_classes / cf);
# 0.0 means the word occurs in every class.
tf_icf.icf

{'according': 0.0,
 'gran': 0.4054651081081644,
 'company': 0.0,
 'plan': 0.0,
 'move': 0.0,
 'production': 0.0,
 'russia': 0.0,
 'although': 0.4054651081081644,
 'growing': 0.4054651081081644,
 'technopolis': 0.4054651081081644,
 'develop': 0.4054651081081644,
 'stage': 0.4054651081081644,
 'area': 0.0,
 'le': 0.0,
 '100000': 1.0986122886681098,
 'square': 1.0986122886681098,
 'meter': 1.0986122886681098,
 'order': 0.0,
 'host': 1.0986122886681098,
 'working': 0.0,
 'computer': 0.0,
 'technology': 0.0,
 'telecommunication': 0.0,
 'statement': 0.4054651081081644,
 'said': 0.0,
 'international': 0.0,
 'electronic': 0.0,
 'industry': 0.0,
 'elcoteq': 0.0,
 'laid': 1.0986122886681098,
 'ten': 0.0,
 'employee': 0.0,
 'tallinn': 0.0,
 'facility': 0.0,
 'contrary': 1.0986122886681098,
 'earlier': 0.0,
 'layoff': 0.0,
 'contracted': 0.4054651081081644,
 'rank': 0.4054651081081644,
 'office': 0.0,
 'worker': 0.0,
 'daily': 0.0,
 'postimees': 1.0986122886681098,
 'reported': 0.0,
 'new': 0.0,
 