In [2]:
import numpy as np
import pandas as pd
from scipy.stats import norm

from sklearn import mixture
from sklearn import cluster
from sklearn.metrics import confusion_matrix
from sklearn.utils.extmath import logsumexp

from pdb import set_trace

In [3]:
EPS = np.finfo(float).eps


class GEM:
    """Two-component, one-dimensional mixture model fitted with an EM-style loop.

    Component 0 has log-density ``-(x - means_[0])`` for ``x > means_[0]`` and
    ``-inf`` elsewhere (a unit-rate exponential shifted to ``means_[0]``).
    Component 1 is a unit-variance Gaussian centred at ``means_[1]``.

    Parameters
    ----------
    n_components : int, default 2
        Number of mixing weights to allocate.  ``log_pdf`` always evaluates
        exactly two densities, so only the default of 2 gives a usable model.
    n_iter : int, default 100
        Maximum number of E-step/M-step rounds performed by ``fit``.
    thresh : float, default 1e-5
        ``fit`` stops as soon as the mean log-likelihood improves by less
        than this amount between two consecutive rounds.

    Attributes
    ----------
    weights_ : ndarray
        Mixing weights, initialised uniformly.
    means_ : ndarray of shape (2,)
        Location of each component.
    converged_ : bool
        Whether the last ``fit`` call stopped on the ``thresh`` criterion.
    res : ndarray of shape (n_samples, 2)
        Responsibilities used by the most recent M-step (set by ``fit``).

    Raises
    ------
    ValueError
        If ``n_iter`` is smaller than 1.
    """

    def __init__(self, n_components=2, n_iter=100, thresh=0.00001):
        if n_iter < 1:
            # With zero rounds fit() would have no responsibilities to return.
            raise ValueError("n_iter must be at least 1, got %r" % (n_iter,))
        self.thresh = thresh
        self.n_components = n_components
        self.n_iter = n_iter
        self.weights_ = np.ones(n_components) / n_components
        # Component 0 starts extremely far to the left: its log-density is
        # then about -1e10, so for data of ordinary magnitude the first
        # E-step hands every sample to the Gaussian component.
        self.means_ = np.array([-1e10, 0.0])
        self.converged_ = False

    @staticmethod
    def _as_1d_array(X):
        """Return ``X`` as a 1-D float ndarray, rejecting other shapes."""
        # A pandas Series cannot be indexed with ``[:, np.newaxis]`` on recent
        # pandas releases, so everything is computed on a plain ndarray.
        values = np.asarray(X, dtype=float)
        if values.ndim != 1:
            raise ValueError(
                "X must be one-dimensional, got shape %r" % (values.shape,)
            )
        return values

    def log_pdf(self, X):
        """Evaluate both component log-densities at every sample.

        Parameters
        ----------
        X : array-like of shape (n_samples,)

        Returns
        -------
        ndarray of shape (n_samples, 2)
            Column 0 is the shifted-exponential log-density, column 1 the
            unit-variance Gaussian log-density.

        Raises
        ------
        ValueError
            If ``X`` is not one-dimensional.
        """
        X = self._as_1d_array(X)
        shift, centre = self.means_[0], self.means_[1]
        # The exponential component has no mass at or below its shift.
        log_exponential = np.where(X > shift, shift - X, -np.inf)
        # (X - centre)**2 is used instead of the expanded square, which loses
        # precision through cancellation when |X| is large.
        log_gaussian = -0.5 * (np.log(2 * np.pi) + (X - centre) ** 2)
        return np.column_stack((log_exponential, log_gaussian))

    def score_samples(self, X):
        """Compute per-sample log-likelihoods and component responsibilities.

        Parameters
        ----------
        X : array-like of shape (n_samples,)

        Returns
        -------
        logprob : ndarray of shape (n_samples,)
            Log of the mixture density at each sample.
        res : ndarray of shape (n_samples, 2)
            Posterior probability of each component for each sample.
        """
        lpr = self.log_pdf(X) + np.log(self.weights_)
        # Stable log-sum-exp across components using NumPy only, so the class
        # does not depend on the sklearn.utils.extmath.logsumexp helper.
        logprob = np.logaddexp.reduce(lpr, axis=1)
        res = np.exp(lpr - logprob[:, np.newaxis])
        return logprob, res

    def fit(self, X):
        """Estimate weights and locations from ``X``.

        Alternates E-steps and M-steps for at most ``n_iter`` rounds and stops
        early once the mean log-likelihood gain drops below ``thresh``.

        Parameters
        ----------
        X : array-like of shape (n_samples,)

        Returns
        -------
        ndarray of shape (n_samples, 2)
            Responsibilities from the last round that was followed by an
            M-step, rounded to the nearest integer (hard assignments).

        Raises
        ------
        ValueError
            If ``X`` is empty or not one-dimensional.
        """
        X = self._as_1d_array(X)
        if X.size == 0:
            # An empty sample would silently turn the weights into NaN.
            raise ValueError("X must contain at least one sample")

        self.converged_ = False
        previous = None
        for _ in range(self.n_iter):
            log_likelihoods, res = self.score_samples(X)
            current = log_likelihoods.mean()
            if previous is not None and current - previous < self.thresh:
                self.converged_ = True
                break
            self.res = res
            self._do_mstep(X)
            previous = current
        return np.round(self.res)

    def _do_mstep(self, X):
        """Update ``weights_`` and ``means_`` from the responsibilities in ``self.res``."""
        totals = self.res.sum(axis=0)
        # EPS keeps every weight strictly positive so its log stays finite.
        self.weights_ = totals / self.res.sum() + EPS
        # Both locations become responsibility-weighted averages of X; EPS
        # guards the division when a component received no responsibility.
        self.means_ = np.dot(self.res.T, X) / (totals + EPS)
        return

In [4]:
# Read 'labeled_sina_new.csv' into a DataFrame; the path is relative, so the
# file must be in the kernel's working directory.
df = pd.read_csv('labeled_sina_new.csv')

In [5]:
# Keep only the rows whose 'state' is 1 or 3, and only the feature column
# together with the 'state' label.
df2 = df.loc[
    df['state'].isin([1, 3]),
    ['ewav_back buy/sell', 'state'],
]

In [6]:
# Single feature fed to the mixture model: the 'ewav_back buy/sell' column.
X=df2['ewav_back buy/sell']

In [7]:
# Mixture model with its default settings (n_components=2).
g=GEM()

In [8]:
# Fit on the underlying NumPy array rather than the Series: GEM.log_pdf indexes
# its intermediate result with [:, np.newaxis], which a pandas Series no longer
# supports on recent pandas releases. `r` holds the rounded responsibilities.
r = g.fit(X.values)

In [9]:
# Display the mixing weights after fitting.
g.weights_

array([ 0.33093432,  0.66906568])

In [10]:
# Display the component locations after fitting.
g.means_

array([ 97.16034559, -37.17418133])

In [11]:
# Rows whose rounded responsibility for component 0 equals 1 are predicted as
# state 3; every other row is predicted as state 1.
y_pred = np.where(np.equal(r[:, 0], 1.0), 3, 1)

In [12]:
# Reference labels: the 'state' column of the filtered frame (values 1 or 3).
y_true = df2['state']

In [13]:
# Compare the reference states against the predicted ones.
confusion_matrix(y_true, y_pred)

array([[123,  16],
       [ 63,  76]])