In [227]:
from scipy.stats import norm
from scipy.stats import expon

from sklearn.metrics import confusion_matrix

import pandas as pd
import numpy as np

In [233]:
# Machine epsilon for float64; used to derive the default convergence threshold.
EPS = np.finfo(float).eps


class GEM:
    """Two-component 1-D mixture of an exponential and a Gaussian, fitted by EM.

    Attributes
    ----------
    thresh : float
        Iteration stops once the absolute change of the total log-likelihood
        between two consecutive E-steps drops below this value.
    n_iter : int
        Maximum number of EM iterations.
    alpha : float
        Mixing weight of the exponential component (the Gaussian gets
        ``1 - alpha``).
    beta : float
        Scale of the exponential component.
    mu, sigma : float
        Mean and standard deviation of the Gaussian component.
    r : array-like
        Per-sample responsibility of the exponential component, available
        after ``fit``.
    end_iter : int
        Iteration index at which convergence was detected (only set when
        ``fit`` converges).
    """

    def __init__(self):
        self.thresh = 10 * EPS
        self.n_iter = 100
        self.alpha = 0.5
        self.beta = 1.0
        self.mu = 0.0
        self.sigma = 1.0

    def fit(self, X):
        """Run EM on the 1-D sample ``X`` and update the parameters in place.

        The current parameter values are used as the starting point and are
        NOT reset, so calling ``fit`` again on the same instance continues
        from the previously fitted values.

        Prints the fitted parameters on convergence, or ``converge fail!``
        when ``n_iter`` iterations were not enough. Returns ``None``.
        """
        # Remembered so `_do_mstep` can still be called with two arguments.
        self._fit_X = X
        prev_logP = None
        for i in range(self.n_iter):
            logP, r1, r2 = self.score_samples(X)
            # Keep the responsibilities in sync with the current parameters.
            self.r = r1
            # `is not None` rather than truthiness: a log-likelihood of
            # exactly 0.0 is a valid previous value.
            if prev_logP is not None and abs(logP - prev_logP) < self.thresh:
                self.end_iter = i
                self.print_info()
                return

            prev_logP = logP
            self._do_mstep(r1, r2, X)

        # The last M-step changed the parameters; refresh the responsibilities
        # so `self.r` matches what the model now holds.
        self.r = self.score_samples(X)[1]
        print("converge fail!")
        return

    def score_samples(self, X):
        """E-step.

        Returns a 3-tuple: the total log-likelihood of ``X`` under the current
        mixture, the responsibilities of the exponential component, and the
        responsibilities of the Gaussian component.
        """
        y1 = self.alpha * expon(scale=self.beta).pdf(X)
        y2 = (1 - self.alpha) * norm(self.mu, self.sigma).pdf(X)
        total = y1 + y2
        return np.sum(np.log(total)), y1 / total, y2 / total

    def _do_mstep(self, r1, r2, X=None):
        """M-step: re-estimate all parameters from the responsibilities.

        ``X`` is the data the responsibilities were computed on. When omitted,
        the data passed to the most recent ``fit`` call is used; the method no
        longer depends on a notebook-level global named ``X``.

        Raises ``ValueError`` if no data is available.
        """
        if X is None:
            X = getattr(self, "_fit_X", None)
            if X is None:
                raise ValueError(
                    "_do_mstep needs the data X (pass it or call fit first)")

        self.r = r1
        self.alpha = np.mean(r1)
        # Responsibility-weighted mean of the data is the exponential scale.
        self.beta = np.sum(r1 * X) / np.sum(r1)
        mu = np.sum(r2 * X) / np.sum(r2)
        covar = np.sum(r2 * (X - mu) ** 2) / np.sum(r2)
        self.mu = mu
        self.sigma = np.sqrt(covar)
        return

    def print_info(self):
        """Print the iteration count and the fitted parameters.

        Each line is built as a single string so the call is valid on both
        Python 2 and Python 3; the spacing reproduces what the original
        comma-separated print statements emitted.
        """
        print("After  %s  iters, converged!" % self.end_iter)
        print("Weights:  %s \t%s" % (np.round(self.alpha, 2),
                                     np.round(1 - self.alpha, 2)))
        print("beta:  %s" % self.beta)
        print("mu:  %s" % self.mu)
        print("sigma:  %s" % self.sigma)
        return

In [250]:
# Load the labelled samples and keep only the rows whose state is 1 or 3,
# together with the single feature column used below.
df = pd.read_csv('labeled_sina_new.csv')
df2 = df[df['state'].isin([1, 3])][['ewav_back buy/sell', 'state']]
# Drop rows outside the open interval (-30, 500) for the feature.
df2 = df2.loc[(df2['ewav_back buy/sell'] > -30) & (df2['ewav_back buy/sell'] < 500)]
# 1-D sample the mixture model is fitted on.
X = df2.loc[:, 'ewav_back buy/sell']

In [251]:
# New model starting from the defaults set in GEM.__init__.
g = GEM()

In [252]:
# Run EM on X; fit prints the fitted parameters when it converges.
g.fit(X)

After  62  iters, converged!
Weights:  0.45 	0.55
beta:  35.2216462488
mu:  -0.340088017395
sigma:  8.7379028


In [253]:
# g.r holds the exponential-component responsibilities stored during fit;
# samples with responsibility above 0.6 are labelled 3, all others 1.
y_pred = np.where(g.r > 0.6,3,1) 

In [254]:
# Ground-truth labels for the same rows X was taken from.
y_true = df2['state']

In [255]:
# Rows are true labels, columns are predicted labels (sklearn convention).
confusion_matrix(y_true, y_pred)

array([[105,  11],
       [ 63,  73]])

In [256]:
# Reload the labelled samples and keep only the rows whose state is 0 or 2,
# together with the single feature column used below.
df = pd.read_csv('labeled_sina_new.csv')
df2 = df[df['state'].isin([0, 2])][['ewav_back sell/buy', 'state']]
# Drop rows outside the open interval (-30, 500) for the feature.
df2 = df2.loc[(df2['ewav_back sell/buy'] > -30) & (df2['ewav_back sell/buy'] < 500)]
# 1-D sample the mixture model is fitted on.
X = df2.loc[:, 'ewav_back sell/buy']

In [257]:
# NOTE(review): `g` is the instance already fitted on the buy/sell data and
# fit() does not reset its parameters, so EM starts here from those fitted
# values rather than from the GEM.__init__ defaults -- confirm this warm
# start is intended; otherwise create a new GEM() first.
g.fit(X)

After  29  iters, converged!
Weights:  0.35 	0.65
beta:  59.2166706124
mu:  0.0074194233831
sigma:  0.0103804521055


In [258]:
# g.r holds the exponential-component responsibilities stored during fit;
# samples with responsibility above 0.6 are labelled 2, all others 0.
y_pred = np.where(g.r > 0.6,2,0) 

In [259]:
# Ground-truth labels for the same rows X was taken from.
y_true = df2['state']

In [260]:
# Rows are true labels, columns are predicted labels (sklearn convention).
confusion_matrix(y_true, y_pred)

array([[75, 26],
       [ 0, 14]])

In [261]:
# Number of samples per true label in the filtered data.
y_true.value_counts()

0    101
2     14
Name: state, dtype: int64