In [0]:
import numpy as np
import pandas as pd
import glob
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from scipy.stats import norm
import random
from numpy.linalg import inv
from math import sqrt, log, exp, pi
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import random as sparse_random
from sklearn.random_projection import sparse_random_matrix

In [0]:
class GMM:
    def __init__(self, k, data, iter):
        self.dim = np.shape(data)[1] #assuming data stored row-wise
        self.k = k
        self.N = np.shape(data)[0] #length of data
        self.mu = []
        self.covar = []
        mu_min = []
        mu_max = []
        #for j in range(self.dim):
        #    mu_min.append(min(data[:, j]))
        #    mu_max.append(max(data[:, j]))
        for i in range(self.k):
            tmp_mu = np.random.uniform(0.1,1, self.dim) #for j in range(self.dim)]
            norm_mu = np.linalg.norm(tmp_mu)
            mu_i = np.divide(tmp_mu, norm_mu)
            self.mu.append(mu_i)
            var_i = np.random.uniform(0.1, 1, (self.dim, self.dim))  #for _ in range(self.dim)] for _ in range(self.dim)]
            self.covar.append(var_i)
        self.pi = [1/k for i in range(k)]
        self.data = data
        self.loglike = []
        self.iterations = iter
        print("Init done")

    def pdf(self, data, mu, cov, dim):
        y = []
        mu_np = np.array(mu)
        cov = np.add(cov, np.eye(self.dim)*10)
        for datum in data:
            data_np = np.array(datum)
            cov_inv= inv(cov)
            u = np.dot((data_np - mu).T, cov_inv)
            u = np.dot(u,(data_np - mu))
            u = u/2
            cov_det = np.array(abs(np.linalg.det(cov))).item(0)
            x = pow(2 * pi, dim/2)
            c =pow(cov_det,1/2)
            z = (data_np-mu_np.flatten())
            a=np.linalg.inv(cov)
            b=data_np-mu_np.flatten()
            #print("X: ",np.shape(x))
            #print("c: ",np.shape(c))
            #print("z: ",np.shape(data_np))
            #print("b ",np.shape(mu_np))
            #print("det: ",cov_det)
            y1 = (1 / (pow(2 * pi, -dim/2)*pow(cov_det,-1/2)))*exp(-1/2 * np.dot(np.dot((data_np-mu_np.flatten()), np.linalg.inv(cov)), (data_np-mu_np.flatten())))
            #y2 = 1/pow((2*pi), -dim/2) * pow(abs(np.linalg.det(cov)), -1/2) * \
            #            np.exp(-1/2 * np.dot(np.dot((data_np-mu).T, np.linalg.inv(cov)), (data_np-mu)))
            y.append(y1)
        return y
    
    def E_step(self):
        likelihood = []
        like_sum = []
        for gauss in range(self.k):
            y = self.pdf(self.data, self.mu[gauss], self.covar[gauss], self.dim)
            tmp_like = [y[i]*self.pi[gauss] for i in range(self.N)]
            if gauss == 0:
                like_sum = tmp_like
            else:
                like_sum = np.add(like_sum, tmp_like)
            likelihood.append(tmp_like)
        for i in range(self.k):
            likelihood[i] = np.divide(likelihood[i], like_sum[i])   
        print("E-step done")     
        return likelihood

    def M_step(self):
        likelihood = self.E_step()
        print(np.shape(likelihood))
        likelihood_sum = [sum(like) for like in likelihood]
        covariances = []
        #print(np.shape(self.covar))
        for i in range(self.k):
            likelihood_sum = [sum(likelihood[i])]
            ## Update pi
            self.pi[i] = np.divide(likelihood_sum,self.N)
            ## Update mean
            x = [np.dot(w,d) for (w,d) in zip(likelihood[i].flatten(), self.data)]
            x = np.array(x)
            self.mu[i] = [np.divide(sum(x[:,col]),likelihood_sum) for col in range(self.dim)]
            #print(np.shape(self.mu))
            ## Update covariance matrix
            u = self.mu[i]  #np.array(self.mu[i]).flatten()
            #print(np.shape(u))
            mat_sum = []
            for j in range(self.N):
                dat = self.data[j]
                #print(np.shape(dat))
                l = likelihood[i][j]
                #print(l)
                tmp_mult = np.multiply(np.dot(np.array((dat-u)).T,np.array((dat-u))), l)
                #print(np.shape(tmp_mult))
                if j==0:
                    mat_sum = tmp_mult
                else:
                    mat_sum = np.add(np.array(mat_sum), np.array(tmp_mult))
            #print(np.shape(mat_sum))  # 200*200
            covariances.append(np.divide(np.array(mat_sum), likelihood_sum))
            
        self.covar = covariances
        print(np.shape(self.covar))   #5*200*200

    def run(self):
        for iter in range(self.iterations):
            #self.loglike = self.E_step()
            self.M_step()
            self.mu = np.reshape(np.array(self.mu),(self.k, self.dim))
            self.pi = np.reshape(np.array(self.pi),(self.k))
            self.covar = np.reshape(np.array(self.covar),(self.k, self.dim, self.dim))
            print(np.shape(self.mu))
            print(np.shape(self.pi))
            print(np.shape(self.covar))
            print("Iteration ", iter+1," done")

        

In [0]:
def get_data(path):
    files = glob.glob(path)
    cols = ['text', 'label']
    df = pd.DataFrame(columns = cols)
    #print(df)
    df_list = []
    for file in files:
        f = open(file, mode = 'r', errors = 'ignore')
        txt_data = f.read()
        txt_data = ''.join([i if ord(i) < 128 else ' ' for i in txt_data])
        name_split = file.split('.')
        name_split = name_split[0].split('_')
        label = int(name_split[1])
        df_list.append([txt_data, label])
        f.close()
    df2 = pd.DataFrame(df_list, columns = cols)
    df = df.append(df2)
    return df

In [0]:
df = get_data('/content/drive/My Drive/SMAI/Assignment-2/q6/dataset/*.txt')

In [0]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')#, preprocessor=my_preprocessor, tokenizer=my_tokenizer)#, ngram_range=(1,2),)
df_text = tfidf_vectorizer.fit_transform(df['text'])
#Test_X_Tfidf = tfidf_vectorizer.fit_transform(Test_X)
#text_np = df['text'].to_numpy()

In [0]:
svd = TruncatedSVD(n_components=200, n_iter=100, random_state=42)
df_text_new = svd.fit_transform(df_text)

In [0]:
label_np = df['label'].to_numpy()
#Train_X, Test_X, Train_Y, Test_Y = train_test_split(df_text_new,label_np,test_size=0.2, random_state=42)

In [0]:
## Encode label
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [50]:
k = 5
iter = 10
print(np.shape(Train_X)[0])
print(np.shape(Train_X)[1])
#for j in range(np.shape(Test_X_Tfidf)[1]):
#    print("min ",j,":",min(Test_X_Tfidf[:, j]))
#    print("max ",j,":",max(Test_X_Tfidf[:, j]))
gmm_obj = GMM(k, Train_X, iter)
gmm_obj.run()

1380
200
Init done
E-step done
(5, 1380)
(5, 200, 200)
(5, 200)
(5,)
(5, 200, 200)
Iteration  1  done
E-step done
(5, 1380)
(5, 1)


ValueError: ignored