In [1]:
import glob
import math
from statistics import *
from sklearn import linear_model
from scipy.sparse import csr_matrix
import numpy as np
import random
#using naive Bayes for Classification
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn import cross_validation
from sklearn import feature_selection
from sklearn.feature_selection import chi2
#from operator import add;
seed=1003
random.seed(seed)
path2Data="cities/"

In [2]:
# reads the POS tags of the tweets in path2Data
# Since numbers and such give many unique ''words'' we can specify which to replace
# currently replacing '#'-hashtags, '@'-usernames, 'U'-url links, 'E'-emoticons, '$'- numeral , ','-punctuations, 'G'-unknown tag
# we are removing some formating symbols eg. ":" which is tagged to '~'

replaceables = ['#', '@', 'U', 'E', '$', ',', 'G']
#replaceables = []
removables = ['~']

def cleanTweet(tweet, tweet_pos):
    tweet_l = tweet.split()
    tweet_pos_l = tweet_pos.split()

    if len(tweet_l) != len(tweet_pos_l):
        for i, item in enumerate(tweet_l):
            print (tweet_l[i], ',' , tweet_pos_l[i])
        
    clean_tweet = []
    for i, item in enumerate(tweet_l):
        #print (item)
        #print (tweet_pos_l[i])
        if tweet_pos_l[i] in replaceables:
            clean_tweet.append(tweet_pos_l[i])
        elif tweet_pos_l[i] in removables:
            None
        else:
            clean_tweet.append(item.lower())
    
    #print (clean_tweet)
    return clean_tweet

# Version 2: generating Valid Keys for corresponding to the top/bottom HIV rates
#sample size is 2 (p*N), p*N for lower and p*N for upper
def sampleItems(locRates, p):
    items = []
    for key in locRates:
        items.append((key, locRates[key]))

    sorted_locRates = sorted(items, key=lambda student: student[1]) 
    total_items = len(items)
    sampleSize = (int) (p*total_items)
    #print ("sample size: ", sampleSize)
    
    ret = []
    lab = {}
    for i, item in enumerate(sorted_locRates):
        if i < sampleSize:
            ret.append(item[0])
            lab[item[1]] = 0
            
        if i >= total_items - sampleSize:
            ret.append(item[0])
            #lab.append(1)
            lab[item[1]] = 1
    
    #print ("samped items size: ", len(ret))
    return (ret,lab)
    

def tfidf(docID, wordID, tf, idf, N):
    tf_0 = 0.5 +  tf[docID].get(wordID, 0)
    idf_0 = math.log( 1 + N/len(idf[wordID]))
    #idf_0 = 1
    return (tf_0 * idf_0)

def conf_mat(Y_hat, Y):
    tp = fp = tn = fn = 0
    for i,j in zip(Y_hat, Y):
        if i == 1:
            if i == j:
                tp = tp + 1
            else:
                fp = fp + 1
        elif i == 0:
            if i == j:
                tn = tn + 1
            else: 
                fn = fn + 1
        else:
            print (" j should only be 0 or 1, however", j , "was encountered.")
    #print (tp, fp, tn, fn)
    return [tp, fp, tn, fn]

In [3]:
filenum = 0
Vcount=0
VVcount=0
Vinv={}          # index to word map
V={}             # word to index
VV={}
VVinv={}
idf={}           # forwardIndex
tf={}            # (word, numberOfWords)
bitf={}
locRates={}      # HIV rates based on locations

N = 0
for file in glob.glob(path2Data+'*.tsv'):
    filenum = filenum + 1  #serves as an index for the file name
    prefix = file.split('.')[0]
    locRates[filenum] = int(prefix.split('_')[-1])

    lines = []
    with open(file, 'r') as f:
        lines = f.readlines()
    #DEBUG
    #print (file + " file num: " + str(filenum) + " num tweets: " + str(len(lines)) )

    unique_words = set([])
    for line in lines:
        ll = line.split('\t')
        tweet = ll[0].strip()
        tweet_pos = ll[1].strip()

        prevWord = "<s>"
        for word in cleanTweet(tweet, tweet_pos):
            if word not in V:
                V[word]= Vcount
                Vinv[Vcount]=word
                Vcount = Vcount + 1

            #bigram 
#             if (prevWord,word) not in VV:
#                 VV[(prevWord, word)] = VVcount
#                 VVinv[VVcount] = (prevWord,word)
#                 VVcount = VVcount + 1

            if V[word] not in idf:
                idf[ V[word] ] = []

            if filenum not in tf:
                tf[filenum] = {}

            freq = tf[filenum].get(V[word], 0)
            tf[filenum][ V[word] ] = freq + 1

            if word not in unique_words:
                idf[ V[word] ].append(filenum)
                unique_words.add(word)

            #bigram
#             if filenum not in bitf:
#                 bitf[filenum] = {}

#             bitf[filenum][VV[(prevWord, word)]] = bitf[filenum].get(VV[(prevWord,word)], 0) + 1
#             prevWord = word

N = filenum
VocabSize=len(V.keys())


In [4]:
#some statistics about the data
print ("vocabSize: ", VocabSize)
print ("docSize: ", N)
print ("labels: ", len(locRates))
mu = mean(locRates.values())
print ("mean: ",  mu )
med = median(locRates.values())
print ("median: ",  median(locRates.values()) )
print ("max: ", max(locRates.values()) )
print ("min: ", min(locRates.values()) )
sigma = stdev(locRates.values())
print ("standard diviation: ", sigma )

vocabSize:  184183
docSize:  1504
labels:  1504
mean:  133.54255319148936
median:  82.5
max:  2084
min:  11
standard diviation:  160.6118525618946


In [5]:
#Regression
#NB classification 
x_cord = []

#sample top/bottom rates

sss = [0.25]
#sss= np.arange(0.05, 0.48, 0.02)
topPercent = 0.25
(validKeys, labels) = sampleItems(locRates, topPercent)
print ("top sample size: ", (int)(topPercent*N), " top: ", topPercent)
row = []
col = []
data = []
data_tf = []
Yreg = []
print (len(validKeys))
for i, docID in enumerate(validKeys):
    for wordID in tf[docID]:
        row.append(i)
        col.append(wordID)
        data.append(tfidf(docID, wordID, tf,idf, N) ) 
        data_tf.append(tf[docID][wordID])
#     # uncomment to use regression
    Yreg.append(locRates[docID]) 
    # used for classification
X = csr_matrix ( (np.array(data),(np.array(row),np.array(col))), shape=(len(validKeys),VocabSize), dtype=np.dtype('d'))
#X_tf = csr_matrix ( (np.array(data_tf),(np.array(row),np.array(col))), shape=(len(validKeys),VocabSize), dtype=float)
print (X[row[0], col[0]])
print ( Yreg[0] )
#bigram
#X = csr_matrix ( (np.array(data),(np.array(row),np.array(col))), shape=(len(trainIndices),VocabSize+len(VV.keys())), dtype=float)
print ("shape:", X.shape)

top sample size:  376  top:  0.25
752
3.40070866448
11
shape: (752, 184183)


In [8]:
#clf = MultinomialNB()
clf_v2 = linear_model.ElasticNetCV(l1_ratio=[0.75, 0.80, 0.85, 0.90, 0.95], n_jobs=3, cv=5, alphas=np.array([0.1, 1.0, 10, 100, 1000, 10000, 100000]))
#print (clf_v2.get_params())
clf_v2.fit(X, Yreg)
#alphas_enet, coefs_enet, _ = linear_model.enet_path(  X, np.array(Yreg, dtype=np.dtype('d')), eps=0.005, l1_ratio=0.8, fit_intercept=False)

#clf = linear_model.Lasso(alpha=0.1)
#clf = linear_model.SGDClassifier()
#     train_errors = list()
#     test_errors = list()
#     alphas = np.logspace(-5, 1, 10)
#     y_ridge = np.array(Yreg)
#     K = 5
#     print (alphas)
#     k_fold = cross_validation.KFold(len(y_ridge), n_folds=K,shuffle=True, random_state=np.random.RandomState(seed))
#     #clf.set_params(alpha=alphas[0])
#     #clf.fit(X[train], y_ridge[train])
#     #for i, alpha in enumerate(alphas):
#     #    print (i, ":", alpha)
#     #    clf.set_params(alpha=alpha)
#     #    tr_err = 0
#     #    ts_err = 0
#     for train, test in k_fold:
#         print("training")
#         clf.fit(X[train], y_ridge[train])
#             #tr_err = tr_err + clf.score(X[train], y_ridge[train])
#             #ts_err = ts_err + clf.score(X[test], y_ridge[test])
#             #clf.fit(X_tf[train], y[train]).predict(X_tf[test])
#     train_errors.append(tr_err/K)
#     test_errors.append(ts_err/K)
    
#     #print ("average acc: {0:.5f}, average precision: {1:.5f}, average recall: {2:.5f}".format(acc/K, prec/K, recal/K))
#     #print ("average acc: {0:.5f}, average precision: {1:.5f}, average recall: {2:.5f}".format(acc_tf/K, prec_tf/K, recal_tf/K))
#     x_cord.append(topPercent)



ElasticNetCV(alphas=array([  1.00000e-01,   1.00000e+00,   1.00000e+01,   1.00000e+02,
         1.00000e+03,   1.00000e+04,   1.00000e+05]),
       copy_X=True, cv=5, eps=0.001, fit_intercept=True,
       l1_ratio=[0.75, 0.8, 0.85, 0.9, 0.95], max_iter=1000, n_alphas=100,
       n_jobs=3, normalize=False, positive=False, precompute='auto',
       tol=0.0001, verbose=0)

In [17]:
alphas_ = clf_v2.alphas_
print(alphas_)
print (len(alphas_))


[  1.00000000e+05   1.00000000e+04   1.00000000e+03   1.00000000e+02
   1.00000000e+01   1.00000000e+00   1.00000000e-01]
7


In [12]:
print (clf_v2.alpha_, clf_v2.l1_ratio_)

10.0 0.75


In [10]:
lines = []
with open( "regression/temp.txt", 'r') as f:
    lines = f.readlines()


['34010.18575435   28358.83068991   10406.48101463    4579.38876158    237957.05647627\n', '30921.21604569   27419.44834937    9601.86674996    8942.95226277    235101.15758987\n', '30953.77101752   35297.70375882    9473.71739998    9787.78209156    227894.00071841\n', '28185.07558869   33646.61283102    7590.73618035   14829.27974783    218610.94051325\n', '26174.72335631   49034.82634789    7213.39643558   18641.01673012    191651.62338366\n', '39714.34252438   43376.77380248   13421.39706272   38469.40537605    185517.9735573 \n', '51067.86669252   40782.66137981   17916.44018973   53832.67461295    201642.60098661\n', '34274.2352558    28522.76970872   10500.91887661    4429.00040861    237957.05647627\n', '30939.22877798   27417.31794429    9604.19162707    8898.27953273    235659.5726195 \n', '30927.97726823   34849.18465459    9490.71500407    9710.22516223    227944.22394889\n', '28217.81436152   33476.27746559    7634.79261262   14790.59618771    218813.36464958\n', '26255.04

In [35]:
alphas_ = [  1.00000000e+05 ,  1.00000000e+04 ,  1.00000000e+03   ,1.00000000e+02, 1.00000000e+01 ,  1.00000000e+00 ,  1.00000000e-01]
mse_ =[]
for line in lines:
    temporino = [ float(item) for item in line.split()]
    mse_.append(sum(temporino)/len(temporino))

In [26]:
print(mse_[:7])
print(mse_[7:14])

[63062.388539347994, 62397.328199532, 62681.394997257994, 60572.528972228, 58543.11725071199, 64099.978464586005, 73048.44877232399]
[63136.796145201995, 62503.71810031399, 62584.46520760199, 60586.569055404005, 58971.670481376, 64963.818946386, 74159.910938286]


In [23]:
with open ("regression/elastic_net_cv10_l1r90_alpha.txt", 'w') as f:
    for alpha in alphas_:
        f.write( "" + str(alpha) + "\n")

[ 104893.56605105   48687.28049004   22598.63374615   10489.35660511
    4868.728049      2259.86337462    1048.93566051     486.8728049
     225.98633746     104.89356605]


In [26]:
mse_path_var = clf_v2.mse_path_
mse1=[]
mse2=[]
mse3=[]
mse_avg=[]
#print (mse_path_)
with open ("regression/elastic_net_cv10_l1r90_var_mse.txt", 'w') as f:
    for c in mse_path_var:
        f.write( str(c) + "\n") 
        mse_avg.append( sum(c)/ len(c))
#         f.write( str(cv1) + "," + str(cv2) + ","+str(cv3) + "\n")
#         mse1.append(cv1)
#         mse2.append(cv2)
#         mse3.append(cv3)
#         mse_avg.append((cv1+cv2+cv3)/3)

In [19]:
coef_ = clf_v2.coef_
len(coef_)
with open ("regression/elastic_net_cv10_l1r90_coef.txt", 'w') as f:
    for c in coef_:
        f.write( "" + str(c) + "\n")

In [21]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import lasso_path, enet_path
from sklearn import datasets

In [53]:
#[0.75, 0.8, 0.85, 0.9, 0.95]
plt.figure()

plt.title('Elastic-Net MSE l1-ratios [0.75, 0.8, 0.85, 0.9, 0.95]')
# plt.subplot(321)
plt.plot(np.log10(alphas_), np.log10(mse_[0:7]), ':' , label='0.75')
# plt.xlabel('Log(alpha)')
# plt.ylabel('Log(MSE)')
# plt.subplot(322)
plt.plot(np.log10(alphas_), np.log10(mse_[7:14]), '--' , label='0.80')

# plt.xlabel('Log(alpha)')
# plt.ylabel('Log(MSE)')
# #plt.title('Elastic-Net MSE l1-ratio  0.80')
# plt.subplot(323)
plt.plot(np.log10(alphas_), np.log10(mse_[14:21]), 'o-', label='0.85' )

# plt.xlabel('Log(alpha)')
# plt.ylabel('Log(MSE)')
# #plt.title('Elastic-Net MSE l1-ratio  0.85')
# plt.subplot(324)
plt.plot(np.log10(alphas_), np.log10(mse_[21:28]), '.-', label='0.90' )

# plt.xlabel('Log(alpha)')
# plt.ylabel('Log(MSE)')
# #plt.title('Elastic-Net MSE l1-ratio  0.90')
# plt.subplot(325)
plt.plot(np.log10(alphas_), np.log10(mse_[28:]), label='0.95' )

plt.xlabel('Log(alpha)')
plt.ylabel('Log(MSE)')
#plt.title('Elastic-Net MSE l1-ratio  0.95')
plt.legend()
#plt.axis('tight')
plt.show()

In [51]:
plt.figure(2)
plt.plot(np.log10(alphas_), np.log10(mse2) )

plt.xlabel('-Log(alpha)')
plt.ylabel('mse')
plt.title('Lasso and Elastic-Net Paths')
#plt.legend(l2, 'Elastic-Net', loc='lower left')
#plt.axis('tight')
plt.show()

In [54]:
plt.figure(3)
plt.plot(np.log10(alphas_), np.log10(mse3) )

plt.xlabel('-Log(alpha)')
plt.ylabel('mse')
plt.title('Lasso and Elastic-Net Paths')
#plt.legend(l2, 'Elastic-Net', loc='lower left')
#plt.axis('tight')
plt.show()

In [44]:
mse_avg = [ sum(mse)/len(mse) for mse in clf_v2.mse_path_[1] ]

In [45]:
plt.figure(4)
plt.plot(np.log10(alphas_), np.log10(mse_avg) )
plt.xlabel('Log(alpha)')
plt.ylabel('Log(mse)')
plt.title('Elastic-Net Parameter selection via log_10(MSE)')
#plt.legend(l2, 'Elastic-Net', loc='lower left')
#plt.axis('tight')
plt.show()

In [84]:
plt.figure(5)
plt.plot(range(len(coef_)), coef_ )
plt.xlabel('Feature Number')
plt.ylabel('Weight')
plt.title('Features')
#plt.legend(l2, 'Elastic-Net', loc='lower left')
#plt.axis('tight')
plt.show()

In [64]:
c_list = []
for i, c in enumerate(coef_):
    c_list.append((i , c))

sorted_c = sorted(c_list, key=lambda student: student[1]) 


In [82]:
with open ("regression/elastic_net_cv3_l1r85_top100Words.txt", 'w') as f:
    for i in sorted_c[:50]:
        f.write( "{0:.5f}".format( i[1]) + "\t" +  Vinv [i[0]] + "\n")    
    for i in sorted_c[-50:]:
        f.write ( "{0:.5f}".format( i[1])+ "\t" +  Vinv [i[0]] + "\n")  

In [7]:
import time

import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC
from sklearn import datasets

diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

rng = np.random.RandomState(42)
X = np.c_[X, rng.randn(X.shape[0], 14)]  # add some bad features

# normalize data as done by Lars to allow for comparison
X /= np.sqrt(np.sum(X ** 2, axis=0))

print (X.shape)

# Compute paths
print("Computing regularization path using the coordinate descent lasso...")
t1 = time.time()
model = LassoCV(cv=20).fit(X, y)
t_lasso_cv = time.time() - t1

# Display results
m_log_alphas = -np.log10(model.alphas_)

plt.figure()
ymin, ymax = 2300, 3800
plt.plot(m_log_alphas, model.mse_path_, ':')
plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha: CV estimate')

plt.legend()

plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
plt.title('Mean square error on each fold: coordinate descent '
          '(train time: %.2fs)' % t_lasso_cv)
plt.axis('tight')
plt.ylim(ymin, ymax)
plt.show()

(442, 24)
Computing regularization path using the coordinate descent lasso...


In [9]:
#NB classification 
x_cord = []
accuracies = []
precision = []
recall = []
tf_accuracies = []
tf_precision = []
tf_recall = []
#sample top/bottom rates
for curr in np.arange(0.05, 0.48, 0.02):
    topPercent = curr
    (validKeys, labels) = sampleItems(locRates, topPercent)
    print ("top sample size: ", (int)(topPercent*N), " top: ", topPercent)
    row = []
    col = []
    data = []
    data_tf = []
    Yclass = []
    Yreg = []

    for i, docID in enumerate(validKeys):
        for wordID in tf[docID]:
            row.append(i)
            col.append(wordID)
            data.append(tfidf(docID, wordID, tf,idf, N) ) 
            data_tf.append(tf[docID][wordID])
    #     # bigram
    #     for bigramID in bitf[docID]:
    #         row.append(i)
    #         col.append(VocabSize + bigramID)
    #         data.append(bitf[docID][bigramID])

    #     # uncomment to use regression
        Yreg.append(locRates[docID]) 
        # used for classification
        Yclass.append (labels[ locRates[docID] ])
    X = csr_matrix ( (np.array(data),(np.array(row),np.array(col))), shape=(len(validKeys),VocabSize), dtype=float)
    X_tf = csr_matrix ( (np.array(data_tf),(np.array(row),np.array(col))), shape=(len(validKeys),VocabSize), dtype=float)
    #bigram
    #X = csr_matrix ( (np.array(data),(np.array(row),np.array(col))), shape=(len(trainIndices),VocabSize+len(VV.keys())), dtype=float)
    #print (X.shape)

    clf = MultinomialNB()
    #clf = linear_model.SGDClassifier()
    y = np.array(Yclass)
    y_ridge = np.array(Yreg)
    K = 5
    acc = 0
    prec = 0
    recal = 0
    acc_tf=0
    prec_tf=0
    recal_tf=0
    k_fold = cross_validation.StratifiedKFold(Yclass, n_folds=K,shuffle=True, random_state=np.random.RandomState(seed))
    avg_ela = 0
    terms = []
    tf_terms = []
    for k, (train, test) in enumerate(k_fold):
        Y_hat = clf.fit(X[train], y[train]).predict(X[test])
        Y_tf_hat = clf.fit(X_tf[train], y[train]).predict(X_tf[test])
#         ch2 = feature_selection.SelectKBest(feature_selection.chi2, k=150)
#         X_new = ch2.fit_transform(X, y)
#         #X_test = ch2.transform(X[test])
#         Y_hat = clf.fit(X_new[train], y[train]).predict(X_new[test])
#         if (k == 0 ):
#             terms = [ Vinv[feature] for feature in ch2.get_support(indices=True) ] 
#             print (terms)
            
        #for feature in ch2.get_support(indices=True):
        #	print ( Vinv[feature] )
        
#         X_tf_new = ch2.fit_transform(X_tf,y)
#         Y_tf_hat = clf.fit(X_tf_new[train], y[train]).predict(X_tf_new[test])
#         if (k == 0 ):
#             terms_tf = [ Vinv[feature] for feature in ch2.get_support(indices=True) ] 
#             print (terms_tf)
        
        cm = conf_mat(Y_hat, y[test])
        cm_tf = conf_mat(Y_tf_hat, y[test])
        
        acc = (cm[0]+cm[2])/(len(Y_hat)) + acc
        prec = prec + cm[0]/max(cm[0]+cm[1],1)
        recal = recal + cm[0]/max(cm[0]+cm[3],1)
        
        acc_tf = (cm_tf[0]+cm_tf[2])/(len(Y_tf_hat)) + acc_tf
        prec_tf = prec_tf + cm_tf[0]/max(cm_tf[0]+cm_tf[1],1)
        recal_tf = recal_tf + cm_tf[0]/max(cm_tf[0]+cm_tf[3],1)

    print ("average acc: {0:.5f}, average precision: {1:.5f}, average recall: {2:.5f}".format(acc/K, prec/K, recal/K))
    print ("average acc: {0:.5f}, average precision: {1:.5f}, average recall: {2:.5f}".format(acc_tf/K, prec_tf/K, recal_tf/K))
    x_cord.append(topPercent)
    accuracies.append(acc/K)
    precision.append(prec/K)
    recall.append(recal/K)
    tf_accuracies.append(acc_tf/K)
    tf_precision.append(prec_tf/K)
    tf_recall.append(recal_tf/K)


sample size:  75
top sample size:  75  top:  0.05
(150, 184183)
average acc: 0.51333, average precision: 0.50690, average recall: 1.00000
average acc: 0.50000, average precision: 0.50000, average recall: 1.00000
sample size:  105
top sample size:  105  top:  0.07
(210, 184183)
average acc: 0.57619, average precision: 0.54882, average recall: 0.88571
average acc: 0.50000, average precision: 0.50000, average recall: 1.00000
sample size:  135
top sample size:  135  top:  0.09
(270, 184183)
average acc: 0.58519, average precision: 0.56230, average recall: 0.78519
average acc: 0.50000, average precision: 0.50000, average recall: 1.00000
sample size:  165
top sample size:  165  top:  0.11
(330, 184183)
average acc: 0.58182, average precision: 0.57139, average recall: 0.67879
average acc: 0.50000, average precision: 0.50000, average recall: 1.00000
sample size:  195
top sample size:  195  top:  0.13
(390, 184183)
average acc: 0.58974, average precision: 0.58490, average recall: 0.61538
averag

In [26]:

fe_accuracies = np.empty([22,40])
fe_tf_accuracies = np.empty([22,40])
#sample top/bottom rates
for ia, curr in enumerate(np.arange(0.05, 0.48, 0.02)):
    topPercent = curr
    (validKeys, labels) = sampleItems(locRates, topPercent)
    print ("top sample size: ", (int)(topPercent*N), " top: ", topPercent)
    row = []
    col = []
    data = []
    data_tf = []
    Yclass = []
    Yreg = []

    for i, docID in enumerate(validKeys):
        for wordID in tf[docID]:
            row.append(i)
            col.append(wordID)
            data.append(tfidf(docID, wordID, tf,idf, N) ) 
            data_tf.append(tf[docID][wordID])

        #     # uncomment to use regression
        Yreg.append(locRates[docID]) 
        # used for classification
        Yclass.append (labels[ locRates[docID] ])
    X = csr_matrix ( (np.array(data),(np.array(row),np.array(col))), shape=(len(validKeys),VocabSize), dtype=float)
    X_tf = csr_matrix ( (np.array(data_tf),(np.array(row),np.array(col))), shape=(len(validKeys),VocabSize), dtype=float)
    
    
    clf = MultinomialNB()
    #clf = linear_model.SGDClassifier()
    y = np.array(Yclass)
    #y_ridge = np.array(Yreg)
    K = 5
    k_fold = cross_validation.StratifiedKFold(Yclass, n_folds=K,shuffle=True, random_state=np.random.RandomState(seed))
    #avg_ela = 0
    #terms = []
    #tf_terms = []
    for ja, topFE in enumerate(range(100, 4100, 100)):
        acc = 0
        acc_tf=0
        for k, (train, test) in enumerate(k_fold):
            #Y_hat = clf.fit(X[train], y[train]).predict(X[test])
            #Y_tf_hat = clf.fit(X_tf[train], y[train]).predict(X_tf[test])
            ch2 = feature_selection.SelectKBest(feature_selection.chi2, k=topFE)
            X_new = ch2.fit_transform(X, y)
             #X_test = ch2.transform(X[test])
            Y_hat = clf.fit(X_new[train], y[train]).predict(X_new[test])
    
            X_tf_new = ch2.fit_transform(X_tf,y)
            Y_tf_hat = clf.fit(X_tf_new[train], y[train]).predict(X_tf_new[test])
    
            cm = conf_mat(Y_hat, y[test])
            cm_tf = conf_mat(Y_tf_hat, y[test])

            acc = (cm[0]+cm[2])/(len(Y_hat)) + acc
            acc_tf = (cm_tf[0]+cm_tf[2])/(len(Y_tf_hat)) + acc_tf
        
        #x_cord.append(topPercent)
        fe_accuracies[ia][ja] = acc/K
#         precision.append(prec/K)
#         recall.append(recal/K)
        fe_tf_accuracies[ia][ja]=acc_tf/K
#         tf_precision.append(prec_tf/K)
#         tf_recall.append(recal_tf/K)
    

top sample size:  75  top:  0.05
top sample size:  105  top:  0.07
top sample size:  135  top:  0.09
top sample size:  165  top:  0.11
top sample size:  195  top:  0.13
top sample size:  225  top:  0.15
top sample size:  255  top:  0.17
top sample size:  285  top:  0.19
top sample size:  315  top:  0.21
top sample size:  345  top:  0.23
top sample size:  376  top:  0.25
top sample size:  406  top:  0.27
top sample size:  436  top:  0.29
top sample size:  466  top:  0.31
top sample size:  496  top:  0.33
top sample size:  526  top:  0.35
top sample size:  556  top:  0.37
top sample size:  586  top:  0.39
top sample size:  616  top:  0.41
top sample size:  646  top:  0.43
top sample size:  676  top:  0.45
top sample size:  706  top:  0.47


  chisq /= f_exp


In [27]:
temp = fe_accuracies

In [28]:
ttt = temp.flatten()

In [60]:
ttt[0]

0.59333333333333327

In [29]:
xs = []
ys = []
for ia, curr in enumerate(np.arange(0.05, 0.48, 0.02)):
    for ja, topFE in enumerate(range(100, 4100, 100)):
        xs.append(((int)(curr*N))*2)
        ys.append(topFE)
        

In [31]:
import matplotlib as mpl
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
#mpl.rcParams['title.fontsize'] = 10
fig = plt.figure()
ax = fig.gca(projection='3d')

ax.plot_trisurf(xs, ys, ttt, cmap=cm.jet, linewidth=0.2)
ax.legend()

ax.set_xlabel('Sample size')
ax.set_ylabel('top K best features')
ax.set_zlabel('Accuracy')
ax.set_title('Naive Bayes Classification 5-fold Cross Validation')
plt.show()



In [63]:
with open ('tables/acc_chi2_sampleSize.txt' , 'w') as f:
    f.write("sampleSize, K, accuracy  \n")
    for i in range(len(xs)):
        #print (xs[i], ys[i], ttt[i])
        f.write( "{0:10d}, {1:10d}, {2:.3f}\n".format(xs[i], ys[i], ttt[i]))

In [32]:

import numpy as np

mpl.rcParams['legend.fontsize'] = 10

theta = np.linspace(-4 * np.pi, 4 * np.pi, 100)
z = np.linspace(-2, 2, 100)
r = z**2 + 1
x = r * np.sin(theta)
y = r * np.cos(theta)
ax.plot(x, y, z, label='parametric curve')
ax.legend()

plt.show()


array([ 0.59333333,  0.63333333,  0.65333333,  0.64666667,  0.60666667,
        0.65333333,  0.64666667,  0.64666667,  0.61333333,  0.58      ,
        0.58666667,  0.58      ,  0.57333333,  0.59333333,  0.60666667,
        0.61333333,  0.62      ,  0.62      ,  0.62      ,  0.62      ,
        0.57142857,  0.5952381 ,  0.5952381 ,  0.6       ,  0.6047619 ,
        0.5952381 ,  0.6047619 ,  0.60952381,  0.60952381,  0.6047619 ,
        0.59047619,  0.59047619,  0.58571429,  0.5952381 ,  0.6047619 ,
        0.58095238,  0.58571429,  0.5952381 ,  0.58571429,  0.5952381 ,
        0.56666667,  0.59259259,  0.6037037 ,  0.59259259,  0.57037037,
        0.57037037,  0.57407407,  0.57037037,  0.58148148,  0.58518519,
        0.57407407,  0.57777778,  0.56666667,  0.56296296,  0.57037037,
        0.55555556,  0.55185185,  0.57407407,  0.56666667,  0.57037037,
        0.55454545,  0.55454545,  0.56969697,  0.56969697,  0.58484848,
        0.57272727,  0.57575758,  0.58787879,  0.58484848,  0.57

In [29]:
fe_accuracies[0][1]

1.2266666666666666

In [109]:
print (terms_tf)

['your', 'no', 'today', ',', 'you', 'more', 'for', 'U', 'i', 'love', 'a', 'shit', "don't", 'know', 'why', '@', 'our', 'are', 'getting', '#', 'still', 'it', 'to', 'do', 'and', 'have', 'but', 'big', 'time', 'feel', 'all', 'the', 'of', 'is', 'right', 'in', 'real', 'back', 'on', 'at', '$', 'then', 'off', 'just', 'god', 'where', 'u', '&', 'made', 'by', 'that', 'always', 'was', "i'm", 'day', 'so', 'E', 'G', 'did', 'my', 'good', 'how', 'use', 'her', 'here', 'new', 'too', 'say', 'with', 'out', 'there', 'via', 'want', 'go', 'up', 'as', 'if', 'much', 'rt', 'well', 'not', "it's", 'this', 'great', 'has', 'or', 'check', 'who', 'any', 'will', 'never', 'get', 'been', 'night', 'think', 'she', 'stop', 'home', 'tonight', 'after', 'me', 'what', 'would', 'am', 'now', 'fuck', 'can', 'only', 'than', 'from', 'something', 'he', 'first', 'im', 'had', 'like', 'work', 'really', 'they', 'about', 'blue', 'black', 'come', 'free', "that's", 'us', 'be', 'make', 'lol', 'white', "can't", 'game', 'should', 'over', 'man'

In [118]:
accuracies_150 = accuracies
accuracies_tf_150 = tf_accuracies

In [5]:
idf_accuracies_no_filter = accuracies
tf_accu_no_filter = tf_accuracies

In [10]:
import matplotlib.pyplot as plt
plt.plot(x_cord, accuracies, label="tfidf+POS")
plt.plot(x_cord, tf_accuracies, label="tf+POS")
#plt.plot(x_cord, accuracies_150, label="tfidf + POS")
#plt.plot(x_cord, accuracies_tf_150, label="tf + POS")
plt.plot(x_cord, idf_accuracies_no_filter, label="tfidf")
plt.plot(x_cord, tf_accu_no_filter, label="tf")
plt.xlabel('top/botom n% HIV rates')
plt.ylabel('Accuracy')
plt.title('Naive Bayes Classification')
plt.legend()
plt.show()

In [59]:
import matplotlib.pyplot as plt
plt.plot(x_cord, precision, label="tfidf + POS filter")
plt.plot(x_cord, tf_precision, label="tf + POS filter")
plt.xlabel('top/botom n% HIV rates')
plt.ylabel('Precision')
plt.legend()
plt.show()


In [61]:
import matplotlib.pyplot as plt
plt.plot(x_cord, recall, label="tfidf + POS filter")
plt.plot(x_cord, tf_recall, label="tf + POS filter")
plt.xlabel('top/botom n% HIV rates')
plt.ylabel('Recall')
plt.legend()
plt.show()

In [72]:
f1 = []
f1_tf = []
for p, r in zip(precision, recall):
    f1.append( 2*(p*r)/(p+r))
    
for p, r in zip(tf_precision, tf_recall):
    f1_tf.append( 2*(p*r)/(p+r))
    

In [17]:
with open ('tables/multinomail_accuracy.txt', 'w') as f:
    f.write("sample size, tfidf+POS, tf+POS, tfidf, tf  \n")
    for i in range(len(x_cord)):
        f.write( "{0:10d}, {1:0.3f}, {2:.3f}, {3:.3f}, {4:.3f}\n".format((int)(x_cord[i]*N*2), accuracies[i], tf_accuracies[i], idf_accuracies_no_filter[i], tf_accu_no_filter[i]))
#         f.write( ','.join(format(x, "2.3f") for x in accuracies) + '\n')
#         f.write( ','.join(format(x, "2.3f") for x in tf_accuracies) + '\n')
#         f.write( ','.join(format(x, "2.3f") for x in idf_accuracies_no_filter) + '\n')
#         f.write( ','.join(format(x, "2.3f") for x in tf_accu_no_filter) + '\n')

In [18]:
# save for experimentation with the size of feature extraction
accuracy_no_fe = accuracies



In [13]:
[i for i in range(len(x_cord))]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [20]:
len(x_cord)

22

In [32]:
lines = []
with open('tables/multinomail_accuracy.txt', 'r') as f:
    lines = f.readlines()
    
sampleSize = []
tfidfPOS = []
tfPOS = []
tfidf = []
tf = []
for i, line in enumerate(lines):
    if i > 0:
        temp = line.strip().split(',')
        sampleSize.append(int(temp[0]))
        tfidfPOS.append(float(temp[1]))
        tfPOS.append(float(temp[2]))
        tfidf.append(float(temp[3]))
        tf.append(float(temp[4]))



In [33]:
import matplotlib.pyplot as plt
plt.plot(sampleSize, tfidfPOS, label="tfidf+POS")
plt.plot(sampleSize, tfPOS, label="tf+POS")
#plt.plot(x_cord, accuracies_150, label="tfidf + POS")
#plt.plot(x_cord, accuracies_tf_150, label="tf + POS")
plt.plot(sampleSize, tfidf, label="tfidf")
plt.plot(sampleSize, tf, label="tf")
plt.xlabel('total (training + testing) sample size ')
plt.ylabel('Accuracy')
plt.title('Naive Bayes Classifier 5-fold Cross Validation')
plt.legend()
plt.show()

In [2]:
print (lines)

['sample size, tfidf+POS, tf+POS, tfidf, tf  \n', '       150, 0.513, 0.500, 0.493, 0.500\n', '       210, 0.576, 0.500, 0.476, 0.500\n', '       270, 0.585, 0.500, 0.489, 0.500\n', '       330, 0.582, 0.500, 0.500, 0.500\n', '       391, 0.590, 0.500, 0.503, 0.500\n', '       451, 0.567, 0.500, 0.513, 0.500\n', '       511, 0.559, 0.500, 0.508, 0.500\n', '       571, 0.558, 0.498, 0.514, 0.500\n', '       631, 0.552, 0.498, 0.514, 0.500\n', '       691, 0.541, 0.499, 0.514, 0.501\n', '       752, 0.565, 0.496, 0.505, 0.499\n', '       812, 0.543, 0.501, 0.512, 0.499\n', '       872, 0.547, 0.499, 0.509, 0.499\n', '       932, 0.542, 0.503, 0.506, 0.498\n', '       992, 0.542, 0.504, 0.504, 0.498\n', '      1052, 0.547, 0.503, 0.510, 0.502\n', '      1112, 0.551, 0.505, 0.506, 0.502\n', '      1173, 0.568, 0.505, 0.512, 0.502\n', '      1233, 0.555, 0.504, 0.516, 0.498\n', '      1293, 0.563, 0.509, 0.516, 0.498\n', '      1353, 0.559, 0.508, 0.514, 0.501\n', '      1413, 0.570, 0.504,

In [74]:
import matplotlib.pyplot as plt
plt.plot(x_cord, f1, label="tfidf + POS filter")
plt.plot(x_cord, f1_tf, label="tf + POS filter")
plt.xlabel('top/botom n% HIV rates')
plt.ylabel('F1')
plt.legend()
plt.show()

In [9]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import lasso_path, enet_path
from sklearn import datasets

diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

In [10]:
X.shape

(442, 10)