In [2]:
import numpy as np
import pandas as pd
import glob
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from scipy.stats import norm
import random
from numpy.linalg import inv
from math import sqrt, log, exp, pi

In [None]:
class GMM:
    def __init__(self, k, data, iter):
        self.dim = np.shape(data)[1] #assuming data stored row-wise
        self.k = k
        self.N = np.shape(data)[0] #length of data
        self.mu = []
        self.covar = []
        mu_min = []
        mu_max = []
        for j in range(self.dim):
            mu_min.append(min(data[:, j]))
            mu_max.append(max(data[:, j]))
        for i in range(k):
            tmp_mu = [random.uniform(mu_min[j], mu_max[j]) for j in range(self.dim)]
            norm_mu = np.linalg.norm(tmp_mu)
            mu_i = np.divide(tmp_mu, norm_mu)
            self.mu.append(mu_i)
            var_i = [[random.uniform(0.1, 1) for _ in range(self.dim)] for _ in range(self.dim)]
            self.covar.append(var_i)
        self.pi = [1/k for i in range(k)]
        self.data = data
        self.loglike = []
        self.iterations = iter
    
    def pdf(self, data, mu, cov, dim):
        y = []
        for datum in data:
            data_np = np.array(datum)
            cov_inv= inv(cov)
            u = np.dot((data_np - mu).T, cov_inv)
            u = np.dot(u,(data_np - mu))
            u = u/2
            cov_det = abs(np.linalg.det(cov))
            y1 = (1 / (pow(2 * pi, dim/2)*pow(cov_det,1/2)))*exp(-1/2 * np.dot(np.dot((data_np-mu).T, np.linalg.inv(cov)), (data_np-mu)))
            #y2 = 1/pow((2*pi), -dim/2) * pow(abs(np.linalg.det(cov)), -1/2) * \
            #            np.exp(-1/2 * np.dot(np.dot((data_np-mu).T, np.linalg.inv(cov)), (data_np-mu)))
            y.append(y1)
        return y
    
    def E_step(self):
        likelihood = []
        like_sum = []
        for gauss in range(self.k):
            y = pdf(self.data, self.mu[gauss], self.cov[guass], self.dim)
            tmp_like = [y[i]*self.pi[gauss] for i in range(self.N)]
            if gauss == 0:
                like_sum = tmp_like
            else:
                like_sum = np.add(like_sum, tmp_like)
            likelihood.append(tmp_like)
        for i in range(k):
            likelihood[i] = np.divide(likelihood, like_sum)        
        return likelihood

    def M_step(self):
        likelihood = E_step()
        likelihood_sum = [sum(like) for like in likelihood]
        
        for i in range(self.k):
            likelihood_sum = [sum(likelihood[i])]
            ## Update pi
            self.pi[i] = likelihood_sum/self.N
            ## Update mean
            x = [np.dot(w,d) for (w,d) in zip(likelihood, self.data)]
            x = np.array(x)
            mu[i] = [ np.divide(sum(x[:,col]),likelihood_sum) for col in range(self.dim)]
            
            ## Update covariance matrix
            tmp_mult = [np.multiply((np.outer((x-u)),(x-u)), l) for (x,u,l) in zip(self.data, mu[i], likelihood[i])]
            tmp_mult = sum(np.array(tmp_mult))
            self.covar = np.divide(np.array(tmp_mult), likelihood_sum)

    def run():
        for iter in range(self.iterations):
            self.loglike = E_step()
            M_step()

        print(self.mu)
        print(self.pi)
        print(self.covar)
        
    def predict(data):
        clust_pred = []
        for datum in data:
            y_predict = []
            for i in range(self.k):
                y = self.pdf(data, self.mu[i], self.covar[i], self.dim)
                y_predict.append(y)
            pred_cluster = list.index(max(y_predict))
            clust_pred.append(pred_cluster)
        return clust_pred

In [2]:
def get_data(path):
    files = glob.glob(path)
    cols = ['text', 'label']
    df = pd.DataFrame(columns = cols)
    #print(df)
    df_list = []
    for file in files:
        f = open(file, mode = 'r', errors = 'ignore')
        txt_data = f.read()
        txt_data = ''.join([i if ord(i) < 128 else ' ' for i in txt_data])
        name_split = file.split('.')
        name_split = name_split[0].split('_')
        label = int(name_split[1])
        df_list.append([txt_data, label])
        f.close()
    df2 = pd.DataFrame(df_list, columns = cols)
    df = df.append(df2)
    return df

['dataset/274_4.txt', 'dataset/100_1.txt', 'dataset/100_2.txt', 'dataset/100_3.txt', 'dataset/100_4.txt', 'dataset/100_5.txt', 'dataset/101_1.txt', 'dataset/101_2.txt', 'dataset/101_3.txt', 'dataset/101_4.txt', 'dataset/101_5.txt', 'dataset/102_1.txt', 'dataset/102_2.txt', 'dataset/102_3.txt', 'dataset/102_4.txt', 'dataset/102_5.txt', 'dataset/103_1.txt', 'dataset/103_2.txt', 'dataset/103_3.txt', 'dataset/103_4.txt', 'dataset/103_5.txt', 'dataset/104_1.txt', 'dataset/104_2.txt', 'dataset/104_3.txt', 'dataset/104_4.txt', 'dataset/104_5.txt', 'dataset/105_1.txt', 'dataset/105_2.txt', 'dataset/105_3.txt', 'dataset/105_4.txt', 'dataset/105_5.txt', 'dataset/106_1.txt', 'dataset/106_2.txt', 'dataset/106_3.txt', 'dataset/106_4.txt', 'dataset/106_5.txt', 'dataset/107_1.txt', 'dataset/107_2.txt', 'dataset/107_3.txt', 'dataset/200_2.txt', 'dataset/200_3.txt', 'dataset/200_4.txt', 'dataset/200_5.txt', 'dataset/201_1.txt', 'dataset/201_2.txt', 'dataset/201_3.txt', 'dataset/201_4.txt', 'dataset/201

In [3]:
df = get_data('dataset/*.txt')

text_np = df['text'].to_numpy()
label_np = df['label'].to_numpy()
Train_X, Test_X, Train_Y, Test_Y = train_test_split(text_np,label_np,test_size=0.2, random_state=42)

<bound method Series.unique of 0       4
1       1
2       2
3       3
4       4
       ..
1720    1
1721    2
1722    3
1723    4
1724    5
Name: label, Length: 1725, dtype: object>


In [6]:
## Encode label
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [7]:
tfidf_vectorizer=TfidfVectorizer(stop_words='english')#, preprocessor=my_preprocessor, tokenizer=my_tokenizer)#, ngram_range=(1,2),
                             #)
Train_X_Tfidf=tfidf_vectorizer.fit_transform(Train_X)

Test_X_Tfidf = tfidf_vectorizer.transform(Test_X)

In [8]:
import spacy
from html import unescape

# create a spaCy tokenizer
spacy.load('en')
lemmatizer = spacy.lang.en.English()

# remove html entities from docs and
# set everything to lowercase
def my_preprocessor(doc):
    return(unescape(doc).lower())

# tokenize the doc and lemmatize its tokens
def my_tokenizer(doc):
    tokens = lemmatizer(doc)
    return([token.lemma_ for token in tokens])

In [32]:

print(type(Train_X_Tfidf[0,:]))

<class 'scipy.sparse.csr.csr_matrix'>


In [127]:
first_vector_tfidfvectorizer=Train_X_Tfidf[0]
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
gene,0.317578
the,0.269123
blue,0.240635
supercomputer,0.190547
machine,0.178051
...,...
eradicate,0.000000
era,0.000000
equivalents,0.000000
equivalent,0.000000


In [98]:
def pdf(data, mu, cov, dim):
        data_np = np.array(data)
        cov_inv= inv(cov)
        u = np.dot((data_np - mu).T, cov_inv)
        u = np.dot(u,(data_np - mu))
        u = u/2
        cov_det = abs(np.linalg.det(cov))
        y1 = (1 / (pow(2 * pi, dim/2)*pow(cov_det,1/2)))*exp(-1/2 * np.dot(np.dot((data_np-mu).T, np.linalg.inv(cov)), (data_np-mu)))
        #y2 = 1/pow((2*pi), -dim/2) * pow(abs(np.linalg.det(cov)), -1/2) * \
        #            np.exp(-1/2 * np.dot(np.dot((data_np-mu).T, np.linalg.inv(cov)), (data_np-mu)))
        return y
    
def E_step(self):
    likelihood = []
    like_sum = []
    for gauss in range(self.k):
        y = pdf(self.data, self.mu[gauss], self.cov[guass], self.dim)
        tmp_like = [y[i]*self.pi[gauss] for i in range(self.N)]
        if gauss == 0:
            like_sum = tmp_like
        else:
            like_sum = np.add(like_sum, tmp_like)
        likelihood.append(tmp_like)
    for i in range(k):
        likelihood[i] = np.divide(likelihood, like_sum)
        
def M_step():
    likelihood = [[1,2,3,4,5], [6,7,8,9,10], [2,4,6,8,10]]
    likelihood_sum = [sum(like) for like in likelihood]
    
    for i in range(self)
        self.pi = likelihood_sum[i]/self.N

SyntaxError: invalid syntax (<ipython-input-98-b3eb8d46d9f9>, line 31)

In [None]:
gmm_obj = GMM()

[[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]


[[[[0.33333333 0.33333333 0.33333333]
   [0.33333333 0.33333333 0.33333333]
   [0.33333333 0.33333333 0.33333333]
   [0.33333333 0.33333333 0.33333333]
   [0.33333333 0.33333333 0.33333333]]]


 [[[0.33333333 0.33333333 0.33333333]
   [0.33333333 0.33333333 0.33333333]
   [0.33333333 0.33333333 0.33333333]
   [0.33333333 0.33333333 0.33333333]
   [0.33333333 0.33333333 0.33333333]]]


 [[[0.33333333 0.33333333 0.33333333]
   [0.33333333 0.33333333 0.33333333]
   [0.33333333 0.33333333 0.33333333]
   [0.33333333 0.33333333 0.33333333]
   [0.33333333 0.33333333 0.33333333]]]]


TypeError: unsupported operand type(s) for /: 'list' and 'int'

In [37]:
var_i = [[random.uniform(0.1, 1) for i in range(3)] for _ in range(3)]
print(var_i)

[[0.32332075177421404, 0.5933503830748696, 0.33071384707467344], [0.42331729516833405, 0.10606216602493675, 0.7404151143786644], [0.14528818083059314, 0.8802537723694497, 0.3824366561452819]]


In [83]:
print(np.multiply(4,y))

[[ 16  32  48  64  80]
 [ 32  64  96 128 160]
 [ 48  96 144 192 240]
 [ 64 128 192 256 320]
 [ 80 160 240 320 400]]


In [44]:
y = np.random.uniform(0.1, 1, (5,24141,24141))
print(y)

MemoryError: Unable to allocate array with shape (5, 24141, 24141) and data type float64

In [87]:
lis = [[1,2,3],[4,5,6]]
d = [[2,4,6],[8,10,12]]

x = np.add(np.array(lis),np.array(d))
print(x)

[[ 3  6  9]
 [12 15 18]]


In [None]:
class KMeans:
    def __init__(self, k, data, iterate):
        self.dim = np.shape(data)[1] #assuming data stored row-wise
        self.k = k
        self.N = np.shape(data)[0]
        self.data = data
        self.iterations = iterate
        self.centroids = [ data[np.random.random_integers(0, self.N-1) for _ in range(self.k)]
        pred_cluster = [-1]*self.N
                          
    def cluster():
        pred_cluster = []
        sum_for_mean = [np.zeros(self.dim)]*self.k
        pred_count = [np.zeros(self.k)]
        for i in range(self.N):
            min_dist = float("inf")
            min_dist_cluster = 
            for j in range(self.k):
                dist = self.data[i] - centroid[j]
                dist_sqrt = sqrt(np.dot(dist,dist))
                if dist_sq < min_dist:
                    min_dist = dist_sq
                    min_dist_cluster = j
            sum_for_mean[min_dist_cluster] = np.add(sum_for_mean[min_dist_cluster], self.data[i])
            pred_count[min_dist_cluster] += 1
            pred_cluster.append(min_dist_cluster)
        for i in range(self.k):
            self.centroid[i] = np.divide(sum_for_mean[i], pred_count[i])
        return pred_cluster
    def run():
        for i in range(self.iterations):
            self.pred_cluster = cluster()

In [11]:
y = [np.zeros(10)]*3
z = [np.ones(10)]*3
summ = np.add(y[0], z[0])
a = [-1]*10
print(a)
print(summ)

[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [12]:
z =random.randint(0,10)
print(z)

6
