In [43]:
import pandas as pd 
import numpy as np
import scipy
from scipy.stats import pearsonr
from sklearn import preprocessing
import scipy.sparse as sp
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score
from sklearn.linear_model import ElasticNet
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn import cross_validation
import xlsxwriter
from sklearn import linear_model, grid_search, datasets
from operator import itemgetter

import warnings
warnings.filterwarnings('ignore')

#### Haemophilus

In [44]:
## Other disease Wikipedia Data

%cd /home/tanlab1/Desktop/MultipleDisease/sonsonson/correlations/combined/LAST_OFFSET_WORK/Other_disease_WIKI_161_week/

wiki_train_data = pd.read_csv("Haemophilus.csv")
weekly_data = wiki_train_data[21:].groupby(np.arange(len(wiki_train_data[21:]))//7).sum()
article_names = ['Haemophilus',
'Haemophilus_influenzae',
'Gram-negative_bacteria',
'Facultative_anaerobic_organism',
'Epiglottitis',
'Cellulitis',
'Meningitis',
'Pathogenic_bacteria',
'Hib_vaccine',
'Osteomyelitis'
]
selected_wiki_articles = ['Haemophilus','Haemophilus_influenzae','Gram-negative_bacteria',
                          'Facultative_anaerobic_organism','Epiglottitis','Hib_vaccine','Osteomyelitis']
other_disease_normalized_weekly_wiki_data = pd.DataFrame()
for i in weekly_data:
    # Create x, where x the 'scores' column's values as floats
    x = weekly_data[i].values.astype(float)
    # Create a minimum and maximum processor object
    min_max_scaler = preprocessing.MinMaxScaler()
    # Create an object to transform the data to fit minmax processor
    x_scaled = min_max_scaler.fit_transform(x)
    # Run the normalizer on the dataframe
    df_normalized = pd.DataFrame(x_scaled)
    other_disease_normalized_weekly_wiki_data = pd.concat([other_disease_normalized_weekly_wiki_data, df_normalized], axis=1)
other_disease_normalized_weekly_wiki_data.columns=article_names
other_disease_normalized_weekly_wiki_data = other_disease_normalized_weekly_wiki_data[selected_wiki_articles]
other_disease_normalized_weekly_wiki_data.shape

/home/tanlab1/Desktop/MultipleDisease/sonsonson/correlations/combined/LAST_OFFSET_WORK/Other_disease_WIKI_161_week


(158, 7)

In [45]:
## Other disease Google Trend Dataset

%cd /home/tanlab1/Desktop/MultipleDisease/sonsonson/correlations/combined/LAST_OFFSET_WORK/Other_Disease_GT_normalized_csv_160_week/
gt_train_data = pd.read_csv("haemophilus_normalized.csv",header=None)
gt_train_data = gt_train_data[2:]
gt_train_data.shape

/home/tanlab1/Desktop/MultipleDisease/sonsonson/correlations/combined/LAST_OFFSET_WORK/Other_Disease_GT_normalized_csv_160_week


(158, 3)

In [46]:
## Flu disease Wiki Dataset

flu_article_names = ['en-Common_cold',
                     'en-Human_flu',
                     'en-Influenza',
                     'en-Influenza-like_illness',
                     'en-Influenza_A_virus',
                     'en-Influenza_A_virus_subtype_H1N1',
                     'en-Influenzavirus_A',
                     'en-Influenzavirus_C',
                     'en-Oseltamivir',
                     'en-Zanamivir']

%cd /home/tanlab1/Desktop/MultipleDisease/sonsonson/correlations/combined/LAST_OFFSET_WORK/Flu_WIKI_161_week/
flu_wiki_data = pd.read_csv("top10_Flu_daily_wiki_data_161_week.csv")
flu_weekly_wiki_data = flu_wiki_data[21:].groupby(np.arange(len(flu_wiki_data[21:]))//7).sum()
normalized_flu_weekly = pd.DataFrame()
for i in flu_weekly_wiki_data:
    # Create x, where x the 'scores' column's values as floats
    x = flu_weekly_wiki_data[i].values.astype(float)
    # Create a minimum and maximum processor object
    min_max_scaler = preprocessing.MinMaxScaler()
    # Create an object to transform the data to fit minmax processor
    x_scaled = min_max_scaler.fit_transform(x)
    # Run the normalizer on the dataframe
    df_normalized = pd.DataFrame(x_scaled)
    normalized_flu_weekly = pd.concat([normalized_flu_weekly, df_normalized], axis=1)
normalized_flu_weekly.columns=flu_article_names
normalized_flu_weekly.shape

/home/tanlab1/Desktop/MultipleDisease/sonsonson/correlations/combined/LAST_OFFSET_WORK/Flu_WIKI_161_week


(158, 10)

In [47]:
## Flu disease Google Flu Trend Dataset

%cd /home/tanlab1/Desktop/MultipleDisease/sonsonson/correlations/combined/LAST_OFFSET_WORK/Flu_GFT_DATA_160_week/
flu_google_flu_trend_data = pd.read_csv("GFT_160week.csv")
flu_google_flu_trend_data = flu_google_flu_trend_data[2:]
flu_google_flu_trend_data.shape

/home/tanlab1/Desktop/MultipleDisease/sonsonson/correlations/combined/LAST_OFFSET_WORK/Flu_GFT_DATA_160_week


(158, 30)

In [48]:
## Combination of all training datasets

# Put the four feature blocks side by side, one row per week.
# pd.concat(axis=1) aligns rows on the index, and two of the frames were
# sliced with [2:] so their labels start at 2; reset_index(drop=True) gives
# every block the same 0..n-1 labels so rows pair up by position.
# drop=True discards the old labels instead of adding an 'index' column, and
# the source frames are not rebound, so this cell can be re-run safely.
feature_blocks = [other_disease_normalized_weekly_wiki_data,
                  gt_train_data,
                  normalized_flu_weekly,
                  flu_google_flu_trend_data]
combined_all_datasets = pd.concat(
    [block.reset_index(drop=True) for block in feature_blocks], axis=1)
print(combined_all_datasets.shape)


(158, 50)


In [49]:
## Observed Data Set

%cd /home/tanlab1/Desktop/MultipleDisease/sonsonson/correlations/combined/LAST_OFFSET_WORK/
observation_data = pd.read_csv("ALL_OBSERVATION_DATA.csv")
selected_observation_disease = ['Flu', 'Haemophilus']

other_disease_normalized_weekly_wiki_data = observation_data[selected_observation_disease]
observation_data = observation_data[selected_observation_disease]

combined_all_datasets = combined_all_datasets.as_matrix()
observation_data = observation_data.as_matrix()

/home/tanlab1/Desktop/MultipleDisease/sonsonson/correlations/combined/LAST_OFFSET_WORK


In [50]:
# NOTE(review): getScore and report are defined in later cells (In [52] and
# In [53]). On a fresh kernel those cells must run before this one; otherwise
# this call raises the NameError recorded in the output below.
# The last argument is the number of target columns in observation_data
# (two: the entries of selected_observation_disease).
r2,mse,mae,best_scores = getScore(combined_all_datasets, observation_data,2)

NameError: name 'getScore' is not defined

In [51]:
# getScore appends one value per target column for every split, so with two
# targets the even positions belong to the first column and the odd positions
# to the second (presumably Flu and Haemophilus, following
# selected_observation_disease).
# For each metric: print both per-split series, then both means.
for metric_values in (r2, mse, mae):
    first_target = metric_values[0::2]
    second_target = metric_values[1::2]
    print(first_target)
    print(second_target)
    print(sum(first_target)/len(first_target))
    print(sum(second_target)/len(second_target))

print(best_scores)

NameError: name 'r2' is not defined

In [52]:
def getScore(X,y, numberOfDisease):
    """Grid-search a MultiTaskElasticNet on ten random splits and score each target.

    For random_state 1..10 the data is split 70/30 into train and test, a
    GridSearchCV over the parameter grid below is fitted on the training part,
    and the refitted best model predicts the test part.

    Parameters
    ----------
    X : 2-D array of features, one row per sample.
    y : 2-D array of targets with at least ``numberOfDisease`` columns
        (columns are addressed as ``y_test[:, i]``).
    numberOfDisease : int
        Number of leading target columns to score.

    Returns
    -------
    (r2, mse, mae, rep)
        ``r2``, ``mse`` and ``mae`` are flat lists holding, for each split in
        order, one value per target column (so with two targets the values of
        the first target sit at positions 0, 2, 4, ...). ``rep`` holds one
        ``report(...)`` summary of the best grid point per split.
    """
    parameters = {"alpha": [1, 0.1, 0.01, 0.001, 0.0001],
                  "l1_ratio": [0, 0.25, 0.5,0.75,1],
                  "fit_intercept": [True],
                  "max_iter":[1000],
                  "normalize":[True, False],
                  "selection":['cyclic', 'random'],
                  "tol":[0.0001, 0.001]}

    r2, mse, mae, rep = [], [], [], []
    for random_generator in range(1,11):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            X, y, test_size=0.3, random_state=random_generator)
        # Seed the estimator too: the grid contains selection='random', whose
        # coordinate order is otherwise drawn from an unseeded generator, which
        # made the reported scores change from run to run.
        mtl = linear_model.MultiTaskElasticNet(random_state=random_generator)
        # Named `search` so the local does not shadow the imported
        # sklearn `grid_search` module.
        search = GridSearchCV(mtl, param_grid=parameters, n_jobs=-1)
        search.fit(X_train, y_train)
        predictions = search.predict(X_test)
        for i in range(numberOfDisease):
            r2.append(r2_score(y_test[:,i], predictions[:,i]))
            mse.append(mean_squared_error(y_test[:,i], predictions[:,i]))
            mae.append(mean_absolute_error(y_test[:,i], predictions[:,i]))

        rep.append(report(search.grid_scores_))
    return r2, mse, mae, rep

In [53]:
def report(grid_scores, n_top=1):
    """Summarise the ``n_top`` best entries of a grid-search score list.

    Entries are ranked by their element at position 1, highest first. For each
    of the top ``n_top`` entries three values are appended to one flat list:
    its ``mean_validation_score``, the standard deviation of its
    ``cv_validation_scores``, and its ``parameters``.

    Returns the flat list (length ``3 * n_top``, or shorter when fewer entries
    are available).
    """
    ranked = sorted(grid_scores, key=itemgetter(1), reverse=True)
    summary = []
    for entry in ranked[:n_top]:
        summary.extend([
            entry.mean_validation_score,
            np.std(entry.cv_validation_scores),
            entry.parameters,
        ])
    return summary