# Assignment 5
## CS 156 | Prof. Sterne
### Anirudh Nair

##### Loading the required libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from random import uniform
from math import sqrt, log, exp, pi


##### Loading the dataset

In [None]:
# Fetch the anonymized transaction log from the course server into a
# DataFrame, then preview its first ten rows.
data = pd.read_csv(
    'https://course-resources.minerva.kgi.edu/uploaded_files/mke/Y5GZpr/anonymized.csv'
)
data.head(10)

Unnamed: 0,Date,Amount
0,25May2016,54241.35
1,29May2017,54008.83
2,30Jun2017,54008.82
3,05Jan2017,52704.37
4,23Feb2017,52704.36
5,04Nov2016,52000.0
6,28Sep2017,49878.83
7,04Apr2017,49878.82
8,25Jul2017,49878.82
9,24Aug2017,49878.82


##### Separating the datetime values in the dataset for ease of use

In [None]:
# Parse every 'DDMonYYYY' string once, then expose the day, month and year
# as separately named columns next to the transaction amount.
dates = [datetime.strptime(raw, '%d%b%Y') for raw in data['Date']]
day = pd.Series([stamp.day for stamp in dates], name='Day')
month = pd.Series([stamp.month for stamp in dates], name='Month')
year = pd.Series([stamp.year for stamp in dates], name='Year')

# Column-wise concatenation; rows line up on the shared integer index.
newdata = pd.concat([day, month, year, data.Amount], axis=1)
newdata.head(n=10)

Unnamed: 0,Day,Month,Year,Amount
0,25,5,2016,54241.35
1,29,5,2017,54008.83
2,30,6,2017,54008.82
3,5,1,2017,52704.37
4,23,2,2017,52704.36
5,4,11,2016,52000.0
6,28,9,2017,49878.83
7,4,4,2017,49878.82
8,25,7,2017,49878.82
9,24,8,2017,49878.82


##### Creating a function to retrieve the Gaussian Mixture Model for the given data

In [None]:
#code retrieved from class reading at: https://www.youtube.com/watch?v=JNlEIEwe-Cg&t=143s

class Gaussian:
    """Univariate normal distribution described by a mean and a spread.

    Attributes are plain and mutable so that a fitting routine can update
    ``mu`` and ``sigma`` in place.
    """

    def __init__(self, mu, sigma):
        # mu: mean; sigma: standard deviation (its sign is ignored by pdf)
        self.mu, self.sigma = mu, sigma

    def pdf(self, datum):
        """Return the density of ``datum`` under the current parameters."""
        spread = abs(self.sigma)
        # standardized distance of the point from the mean
        z = (datum - self.mu) / spread
        normalizer = 1 / (sqrt(2 * pi) * spread)
        return normalizer * exp(-z * z / 2)

    def __repr__(self):
        """Return a compact textual form showing the mean and the spread."""
        template = 'Gaussian({0:4.6}, {1:4.6})'
        return template.format(self.mu, self.sigma)

class GaussianMixture:
    """Mixture of two univariate Gaussians fitted by expectation-maximization.

    ``self.one`` and ``self.two`` are the two components; ``self.mix`` is the
    weight of ``self.one`` (``self.two`` gets ``1 - self.mix``).
    ``self.loglike`` holds the log-likelihood computed by the most recently
    exhausted E-step.
    """

    def __init__(self, data, mu_min=None, mu_max=None, sigma_min=.1, sigma_max=1, mix=.5):
        """Draw random starting parameters for both components.

        data       -- iterable of numeric observations
        mu_min     -- lower bound for the random initial means
                      (default: smallest observation in ``data``)
        mu_max     -- upper bound for the random initial means
                      (default: largest observation in ``data``)
        sigma_min  -- lower bound for the random initial standard deviations
        sigma_max  -- upper bound for the random initial standard deviations
        mix        -- initial weight of the first component

        Raises ValueError if a mean bound has to be inferred from empty data.
        """
        # Materialize once: the observations are traversed on every step.
        self.data = list(data)

        # The bounds must come from *this* instance's data. Computing them in
        # the signature would evaluate them a single time, at definition time,
        # against whatever global happened to be called ``data``.
        if mu_min is None or mu_max is None:
            if not self.data:
                raise ValueError('cannot infer the mean range from empty data')
            if mu_min is None:
                mu_min = min(self.data)
            if mu_max is None:
                mu_max = max(self.data)

        #init with multiple gaussians
        self.one = Gaussian(uniform(mu_min, mu_max),
                            uniform(sigma_min, sigma_max))
        self.two = Gaussian(uniform(mu_min, mu_max),
                            uniform(sigma_min, sigma_max))

        #as well as how much to mix them
        self.mix = mix

    def Estep(self):
        """Perform an E(stimation)-step, freshening up self.loglike in the process.

        This is a generator yielding one ``(w1, w2)`` responsibility pair per
        observation; ``self.loglike`` is only complete once the generator has
        been exhausted.
        """
        self.loglike = 0. # = log(p = 1)
        for datum in self.data:
            # unnormalized weights
            wp1 = self.one.pdf(datum) * self.mix
            wp2 = self.two.pdf(datum) * (1. - self.mix)
            # mixture density of this observation
            den = wp1 + wp2
            # normalize
            wp1 /= den
            wp2 /= den
            # The likelihood contribution is the mixture density itself; the
            # normalized weights always sum to 1, whose log is 0.
            self.loglike += log(den)
            # yield weight tuple
            yield (wp1, wp2)

    def Mstep(self, weights):
        """Perform an M(aximization)-step.

        weights -- iterable of ``(w1, w2)`` pairs, one per observation, as
                   produced by ``Estep``.
        """
        # compute denominators
        (left, rigt) = zip(*weights)
        one_den = sum(left)
        two_den = sum(rigt)
        # compute new means
        self.one.mu = sum(w * d / one_den for (w, d) in zip(left, self.data))
        self.two.mu = sum(w * d / two_den for (w, d) in zip(rigt, self.data))
        # compute new sigmas
        self.one.sigma = sqrt(sum(w * ((d - self.one.mu) ** 2)
                                  for (w, d) in zip(left, self.data)) / one_den)
        self.two.sigma = sqrt(sum(w * ((d - self.two.mu) ** 2)
                                  for (w, d) in zip(rigt, self.data)) / two_den)
        # compute new mix
        self.mix = one_den / len(self.data)

    def iterate(self, N=1, verbose=False):
        """Perform N EM iterations, then compute the log-likelihood.

        N       -- number of E-step/M-step rounds to run
        verbose -- when true, print the model after every round
        """
        for i in range(1, N + 1):
            self.Mstep(self.Estep())
            if verbose:
                print('{0:2} {1}'.format(i, self))
        # Estep is a generator, so it has to be exhausted for self.loglike to
        # be recomputed against the final parameters.
        for _ in self.Estep():
            pass

    def pdf(self, x):
        """Return the mixture density at ``x``."""
        return (self.mix)*self.one.pdf(x) + (1-self.mix)*self.two.pdf(x)

    def __repr__(self):
        return 'GaussianMixture({0}, {1}, mix={2:.03})'.format(self.one,
                                                               self.two,
                                                               self.mix)

    def __str__(self):
        return 'Mixture: {0}, {1}, mix={2:.03}'.format(self.one,
                                                       self.two,
                                                       self.mix)

##### Density Model for the number of transactions occurring in a single month

In [None]:
#retrieving the required data: the number of transactions in each calendar month
month_data = newdata.sort_values(by=['Year','Month'])

# Group on (Year, Month) rather than scanning for a change in Month alone:
# this keeps the final month (a manual running counter never flushes it
# after the loop ends) and keeps the same month of different years apart.
# size() counts the rows of each group; groups come out in sorted key order.
transactions = month_data.groupby(['Year', 'Month']).size().tolist()
    

In [None]:

#fitting the gaussian mixture models
from copy import deepcopy

n_iterations = 5
best_mix = None
best_loglike = float('-inf')
mix = GaussianMixture(transactions)
for _ in range(n_iterations):
    try:
        #train!
        mix.iterate(verbose=True)
    except (ZeroDivisionError, ValueError, RuntimeWarning) as err:
        # A degenerate component (zero variance or zero total weight) makes the
        # update fail. The next step would start from the same parameters, so
        # report the problem and stop instead of silently retrying.
        print('EM step failed ({0!r}); keeping the best model found so far'.format(err))
        break
    if mix.loglike > best_loglike:
        best_loglike = mix.loglike
        # Snapshot the model: ``mix`` keeps being mutated by later iterations,
        # so storing a bare reference would not preserve the best state.
        best_mix = deepcopy(mix)

TypeError: ignored