In [20]:
import re
import os
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

np.random.seed(42)
Texts = []       # one entry per input file: the file's full contents
File_names = []  # base name of each file, index-aligned with Texts

text_file_dir = '/kaggle/input'
# Load every file under the input directory as ONE document.
# Reading line-by-line would add several Texts entries for a multi-line file
# (and none for an empty file) while File_names gets exactly one, which breaks
# the one-row-per-file table built later from these two lists.
for dirname, dirnames, filenames in os.walk(text_file_dir):
    # os.walk yields entries in arbitrary filesystem order; sorting in place
    # (allowed in top-down mode) makes the document order reproducible.
    dirnames.sort()
    for filename in sorted(filenames):
        text_file = os.path.join(dirname, filename)
        # Explicit encoding so the result does not depend on the host locale.
        with open(text_file, encoding='utf-8') as file:
            Texts.append(file.read())
        # Record the name only after a successful read to keep lists aligned.
        File_names.append(filename)

In [21]:
def lemmatization(lemmatizer,sentence):
    """Lemmatize every token and return the distinct lemmas.

    Args:
        lemmatizer: Object exposing ``lemmatize(token)``.
        sentence: Iterable of string tokens.

    Returns:
        A list of unique lemmas in order of first appearance. Duplicates are
        dropped, so downstream term counts see each lemma at most once per
        call.
    """
    lemmas = (lemmatizer.lemmatize(token) for token in sentence)
    # dict.fromkeys de-duplicates while keeping insertion order; a plain set()
    # would make the order vary between interpreter runs (hash randomisation).
    return list(dict.fromkeys(lemmas))

def remove_stop_words(stopwords_list,sentence):
    """Return the tokens of ``sentence`` that are not stop words.

    Args:
        stopwords_list: Collection of stop-word strings (list, tuple or set).
        sentence: Iterable of string tokens.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    # Membership tests against a list are linear per token; build a set once
    # (or reuse one the caller already made) so each lookup is constant time.
    if isinstance(stopwords_list, (set, frozenset)):
        stop_set = stopwords_list
    else:
        stop_set = set(stopwords_list)
    return [token for token in sentence if token not in stop_set]

def preprocessed_rallies(rallies):
    """Clean each raw text and return the cleaned texts as a NumPy array.

    For every text the pipeline is: lower-case, tokenize on ``\\w+`` (which
    drops punctuation), strip ASCII digits from each token, discard tokens
    left empty, lemmatize via ``lemmatization``, remove English stop words via
    ``remove_stop_words``, then re-join the tokens with single spaces.

    Args:
        rallies: Iterable of raw text strings.

    Returns:
        ``np.ndarray`` holding one cleaned string per input text.
    """
    # These resources do not depend on the individual text, so build them once
    # rather than once per document.
    lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')
    stopwords_list = set(stopwords.words('english'))  # set: O(1) membership
    digit_pattern = re.compile('[0-9]')

    updated_rallies = []
    for rallie in rallies:
        tokens = tokenizer.tokenize(rallie.lower())  # Remove punctuation
        tokens = [digit_pattern.sub('', token) for token in tokens]  # Remove numbers
        tokens = [token for token in tokens if token]  # Remove empty strings
        lemmatized = lemmatization(lemmatizer, tokens)  # Word lemmatization
        kept = remove_stop_words(stopwords_list, lemmatized)  # Remove stop words
        updated_rallies.append(' '.join(kept))
    return np.array(updated_rallies)

def get_data(Texts):
    """Run the preprocessing pipeline over ``Texts`` and return its result.

    Thin convenience wrapper around ``preprocessed_rallies``.
    """
    return preprocessed_rallies(Texts)

In [22]:
# Clean every loaded text; the result feeds the TF-IDF vectorizer below.
preprocessed_texts = get_data(Texts)

In [23]:
# Turn the cleaned texts into a sparse TF-IDF matrix (rows: texts, cols: terms).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_texts)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
_get_feature_names = (
    getattr(vectorizer, "get_feature_names_out", None)
    or vectorizer.get_feature_names
)
vocabulary = list(_get_feature_names())  # keep the original list type
print("Input feature shape : {}".format(X.shape))

Input feature shape : (35, 7198)


In [24]:
from sklearn.cluster import KMeans
# Partition the TF-IDF rows into two clusters.
# n_init is pinned to 10 (the long-standing default) because newer
# scikit-learn releases switch the default to 'auto', which runs fewer
# initialisations and can yield a different clustering for the same seed.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(X)

KMeans(n_clusters=2, random_state=42)

In [25]:
# Cluster index assigned to each row of X. These are unsupervised KMeans
# cluster ids, so which id corresponds to which "sentiment" is arbitrary.
sentiments = kmeans.predict(X)

# The table below pairs the i-th file name with the i-th label, which is only
# meaningful when there is exactly one text per file. Fail with an explicit
# message instead of pandas' generic length-mismatch error.
if len(File_names) != len(sentiments):
    raise ValueError(
        "Cannot build results: {} file names but {} clustered texts; "
        "each file must contribute exactly one text.".format(
            len(File_names), len(sentiments)
        )
    )

results = {
    'File Name': File_names,
    'Sentiments': sentiments,
}

df = pd.DataFrame(results)
df.to_csv('/kaggle/working/sentiments.csv', index=False)