In [31]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics import classification_report
from time import time
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    words = text.split()
    return words

def create_bag_of_words(docs):
    word_freq = {}
    for doc in docs:
        words = preprocess_text(doc)
        for word in words:
            if word not in word_freq:
                word_freq[word] = 0
            word_freq[word] += 1

    unique_words = list(word_freq.keys())

    bag_of_words = []
    for doc in docs:
        words = preprocess_text(doc)
        vector = [0] * len(unique_words)
        for i, word in enumerate(unique_words):
            if word in words:
                vector[i] = words.count(word)
        bag_of_words.append(vector)

    return bag_of_words, unique_words

documents = ["Великолепный сериал, который поможет успокоить нервы при любых стрессах и просто скрасит серые будни",
         "Пожалуй, если бы я посмотрел только первые пару сезонов этого сериала, я бы с легкой руки написал ему положительную рецензию",
         "В общем, если создатели этого сериала не вернут всё на круги своя, то рейтинги следующих сезонов будут становится все ниже и ниже, а зрительская аудитория будет все меньше и меньше."]



bag_of_words, unique_words = create_bag_of_words(documents)

print("Мешок слов:")
for vector in bag_of_words:
    print(vector)

print("\nУникальные слова:")
print(unique_words)


Мешок слов:
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2]

Уникальные слова:
['великолепный', 'сериал', 'который', 'поможет', 'успокоить', 'нервы', 'при', 'любых', 'стрессах', 'и', 'просто', 'скрасит', 'серые', 'будни', 'пожалуй', 'если', 'бы', 'я', 'посмотрел', 'только', 'первые', 'пару', 'сезонов', 'этого', 'сериала', 'с', 'легкой', 'руки', 'написал', 'ему', 'положительную', 'рецензию', 'в', 'общем', 'создатели', 'не', 'вернут', 'всё', 'на', 'круги', 'своя', 'то', 'рейтинги', 'следующих', 'будут', 'становится', 'все', 'ниже', 'а', 'зрительская', 'аудитория',

In [11]:
import math

def calculate_tf(term, document):
    word_count = len(document.split())
    term_count = document.split().count(term)
    tf = term_count / word_count
    return tf

def calculate_idf(term, documents):
    document_count = len(documents)
    term_occurrences = sum(1 for document in documents if term in document)
    idf = math.log((document_count + 1) / (1 + term_occurrences)) + 1
    return idf

def calculate_tfidf(term, document, documents):
    tf = calculate_tf(term, document)
    idf = calculate_idf(term, documents)
    tfidf = tf * idf
    return tfidf

def calculate_tfidf_for_documents(documents):
    tfidf_documents = []
    for document in documents:
        tfidf_document = {}
        document_terms = document.split()
        for term in document_terms:
            tfidf_document[term] = calculate_tfidf(term, document, documents)
        tfidf_documents.append(tfidf_document)
    return tfidf_documents

documents =["Великолепный сериал, который поможет успокоить нервы при любых стрессах и просто скрасит серые будни",
         "Пожалуй, если бы я посмотрел только первые пару сезонов этого сериала, я бы с легкой руки написал ему положительную рецензию",
         "В общем, если создатели этого сериала не вернут всё на круги своя, то рейтинги следующих сезонов будут становится все ниже и ниже, а зрительская аудитория будет все меньше и меньше."]

tfidf_documents = calculate_tfidf_for_documents(documents)
for i, document in enumerate(tfidf_documents):
    print(f"TF-IDF for document {i+1}:")
    for term, tfidf in document.items():
        print(f"{term}: {tfidf}")
    print()

TF-IDF for document 1:
Великолепный: 0.12093908432571038
сериал,: 0.12093908432571038
который: 0.12093908432571038
поможет: 0.12093908432571038
успокоить: 0.12093908432571038
нервы: 0.12093908432571038
при: 0.12093908432571038
любых: 0.12093908432571038
стрессах: 0.12093908432571038
и: 0.07142857142857142
просто: 0.12093908432571038
скрасит: 0.12093908432571038
серые: 0.12093908432571038
будни: 0.12093908432571038

TF-IDF for document 2:
Пожалуй,: 0.08465735902799727
если: 0.06438410362258905
бы: 0.1287682072451781
я: 0.1287682072451781
посмотрел: 0.08465735902799727
только: 0.08465735902799727
первые: 0.08465735902799727
пару: 0.08465735902799727
сезонов: 0.06438410362258905
этого: 0.06438410362258905
сериала,: 0.08465735902799727
с: 0.05
легкой: 0.08465735902799727
руки: 0.08465735902799727
написал: 0.08465735902799727
ему: 0.08465735902799727
положительную: 0.08465735902799727
рецензию: 0.08465735902799727

TF-IDF for document 3:
В: 0.04292273574839269
общем,: 0.05643823935199818
ес

In [13]:
def get_matrix(document):
    matrix = []
    result = {}
    for d in document:
        result.update(d)
    unique_words = list(result.keys())
    for words in document:
        vector = [0] * len(unique_words)
        for i, word in enumerate(unique_words):
            if word in words:
                vector[i] = result[word]
        matrix.append(vector)
    return matrix


f = get_matrix(tfidf_documents)
data = pd.DataFrame(f)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46,47,48,49,50,51,52,53,54,55
0,0.120939,0.120939,0.120939,0.120939,0.120939,0.120939,0.120939,0.120939,0.120939,0.066667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,...,0.056438,0.112876,0.056438,0.056438,0.033333,0.056438,0.056438,0.056438,0.056438,0.056438


In [20]:
from sklearn.model_selection import train_test_split

df = pd.read_csv('../data/spam.csv', encoding='latin-1')
y = df["v1"]
X = list(df["v2"])

tfidf_documents = calculate_tfidf_for_documents(X)
for i, document in enumerate(tfidf_documents):
    if(i == 5):
        break
    print(f"TF-IDF for document {i+1}:")
    for term, tfidf in document.items():
        print(f"{term}: {tfidf}")
    print()

TF-IDF for document 1:
Go: 0.22361985965358788
until: 0.31649259607826036
jurong: 0.4466270803504795
point,: 0.4466270803504795
crazy..: 0.4466270803504795
Available: 0.41196972132248233
only: 0.22247695274978538
in: 0.07827322735642894
bugis: 0.38398893192571115
n: 0.05569551144273989
great: 0.25856707456580147
world: 0.31467421386971667
la: 0.13288897928838259
e: 0.05328138479630617
buffet...: 0.4466270803504795
Cine: 0.4466270803504795
there: 0.21999710569281677
got: 0.20363646013239595
amore: 0.4466270803504795
wat...: 0.3570391068890768

TF-IDF for document 2:
Ok: 0.7342221604152019
lar...: 1.1225528382788952
Joking: 1.4887569345015983
wif: 0.9936878569066483
u: 0.20750047346799366
oni...: 1.421179416483571

TF-IDF for document 3:
Free: 0.1963769429473016
entry: 0.46358532654574186
in: 0.055909448111734955
2: 0.10183803218614007
a: 0.039071469642958795
wkly: 0.25813548267039876
comp: 0.18189604622915287
to: 0.20465563425528513
win: 0.1710502457363592
FA: 0.4700833107032938
Cup: 0.

In [21]:
matr = get_matrix(tfidf_documents)
data = pd.DataFrame(matr)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15575,15576,15577,15578,15579,15580,15581,15582,15583,15584
0,0.319457,0.210995,0.446627,0.446627,0.446627,0.374518,0.222477,0.06021,0.187312,0.111391,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.06021,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.0,0.297751,0.297751,0.297751,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5568,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,1.029924,0.000000,0.000000,0.000000,0.000000,0.000000
5569,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.06021,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.893254,0.893254,0.893254,0.000000,0.000000
5570,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.06021,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.343559,0.000000


In [23]:
maping = {
    "ham" : 1,
    "spam": 0
}

y = y.replace(maping)
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: v1, Length: 5572, dtype: int64

In [25]:
class MetricsClassification:
    @staticmethod
    def accuracy(y_test, y_pred):
        y_true, predictions = np.array(y_test), np.array(y_pred)
        return len([x for x, y  in zip(y_true, predictions) if x  == y])/len(y_true)
    
    @staticmethod
    def confusion_matrix(y_test, y_pred):
        y_true, predictions = np.array(y_test), np.array(y_pred)
        TP = FP =TN =FN = 0
        for test, pred in zip(y_true, predictions):
            if (test == 1 and pred == 1):
                TP += 1 
            elif (test == 0 and pred == 0):
                TN += 1
            elif (test == 1 and pred == 0):
                FN += 1
            elif (test == 0 and pred == 1):
                FP += 1
        return [[TP, FP],[FN, TN]]
    @staticmethod
    def precision( y_test, y_pred):
        matrix = MetricsClassification.confusion_matrix(y_test, y_pred)
        TP = matrix[0][0]
        FP = matrix[0][1]
        return TP/(TP + FP)
    @staticmethod
    def recall(y_test, y_pred):
        matrix = MetricsClassification.confusion_matrix(y_test, y_pred)
        TP = matrix[0][0]
        FN = matrix[1][0]
        return TP/(TP + FN)
    
    @staticmethod
    def f_score(y_test, y_pred):
        recall_score = MetricsClassification.recall(y_test, y_pred)
        precision_score = MetricsClassification.precision(y_test, y_pred)
        return 2*(recall_score * precision_score)/ (recall_score+precision_score)

In [26]:
def pca(X, num_components):
    X_meaned = X - np.mean(X, axis=0)
    cov_matrix = np.cov(X_meaned, rowvar=False)
    eigen_values, eigen_vectors = np.linalg.eigh(cov_matrix)
    sorted_index = np.argsort(eigen_values)[::-1]
    sorted_eigenvalues = eigen_values[sorted_index]
    sorted_eigenvectors = eigen_vectors[:, sorted_index]
    eigenvector_subset = sorted_eigenvectors[:, 0:num_components]
    X_reduced = np.dot(eigenvector_subset.transpose(), X_meaned.transpose()).transpose()
    return X_reduced

In [27]:
X = pca(data, 100)

In [28]:
class KNN:
    def __init__(self, k):
        self.k = k
        
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        
    def predict(self, X):
        predictions = []
        for x in X:
            distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
            indices = np.argsort(distances)[:self.k]
            k_nearest_labels = np.array(self.y_train)[indices]
            most_common_label = np.bincount(k_nearest_labels).argmax()
            predictions.append(most_common_label)
        return predictions


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=4)
knn = KNN(10)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)
print(MetricsClassification.accuracy(y_test, predictions))
print(MetricsClassification.confusion_matrix(y_test, predictions))
print(MetricsClassification.precision(y_test, predictions))
print(MetricsClassification.recall(y_test, predictions))
print(MetricsClassification.f_score(y_test, predictions))

0.9330143540669856
[[690, 42], [14, 90]]
0.9426229508196722
0.9801136363636364
0.9610027855153203


In [30]:
classification_report(y_test, predictions)

              precision    recall  f1-score   support

           0       0.87      0.68      0.76       132
           1       0.94      0.98      0.96       704

    accuracy                           0.93       836
   macro avg       0.90      0.83      0.86       836
weighted avg       0.93      0.93      0.93       836



In [32]:
n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20

In [33]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [35]:
df = pd.read_csv('../data/spam.csv', encoding='latin-1')
y = df["v1"]
X = df["v2"]
data_samples = X

In [36]:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(data_samples)

In [37]:
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=20, 
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0).fit(tf)

In [38]:
print_top_words(lda, tf_vectorizer.get_feature_names_out(), 10)

Topic #0:
just ask hey meet min house live end month ah
Topic #1:
prize claim won number cash urgent win txt guaranteed cool
Topic #2:
ok good did way doing night today oh home just
Topic #3:
come ur da ì_ send msg service customer care soon
Topic #4:
day going great said went happy wish hope man lunch
Topic #5:
gt lt know don time want just need think dont
Topic #6:
lor got like wat say pls lol buy cos dun
Topic #7:
free stop reply text txt new www mobile com week
Topic #8:
sorry later love ll dear amp ur life thanks gud
Topic #9:
hi home im babe love phone miss tell mobile text
