This notebook reads in the cleaned training and testing data, and transforms them using count vectorizer, tfidf transformer, and then tsvd.  It returns csv files on various initial feature columns transformed through this process.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve,recall_score, precision_score, precision_recall_curve, confusion_matrix, classification_report
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [6]:
df=pd.read_csv('training_data_cleansed.csv')
#df=pd.read_csv('training_data_cleansed_no_2009.csv')

In [9]:
print len (df)
df=df[df['year']!=2009]
print len(df)

43106
40876


In [10]:
df_to_predict=pd.read_csv('predicting_data_cleansed.csv')

In [11]:
nlp_features=[u'adverse_reactions', u'clinical_pharmacology',
       u'contraindications', u'description', u'dosage_and_administration',
       u'how_supplied', u'indications_and_usage', u'overdosage',
       u'spl_product_data_elements']

In [12]:
for col in nlp_features:
    df[col]=df[col].fillna('none')

In [13]:
for col in nlp_features:
    df_to_predict[col]=df_to_predict[col].fillna('none')

In [14]:
cv=CountVectorizer(max_features=1000)

In [15]:
count_1=cv.fit_transform(df['adverse_reactions'])

In [16]:
count_1_predict=cv.transform(df_to_predict['adverse_reactions'])

In [17]:
tfidfTrans=TfidfTransformer()

In [18]:
col_1=tfidfTrans.fit_transform(count_1)

In [19]:
col_1_predict=tfidfTrans.transform(count_1_predict)

In [20]:
col_2=tfidfTrans.fit_transform(cv.fit_transform(df['clinical_pharmacology']))


In [21]:
col_2_predict=tfidfTrans.transform(cv.fit_transform(df_to_predict['clinical_pharmacology']))


In [22]:
col_3=tfidfTrans.fit_transform(cv.fit_transform(df['contraindications']))
col_3_predict=tfidfTrans.transform(cv.fit_transform(df_to_predict['contraindications']))
col_4=tfidfTrans.fit_transform(cv.fit_transform(df['description']))
col_4_predict=tfidfTrans.transform(cv.fit_transform(df_to_predict['description']))
col_5=tfidfTrans.fit_transform(cv.fit_transform(df['dosage_and_administration']))
col_5_predict=tfidfTrans.transform(cv.fit_transform(df_to_predict['dosage_and_administration']))

col_6=tfidfTrans.fit_transform(cv.fit_transform(df['how_supplied']))
col_6_predict=tfidfTrans.transform(cv.fit_transform(df_to_predict['how_supplied']))

col_7=tfidfTrans.fit_transform(cv.fit_transform(df['indications_and_usage']))
col_7_predict=tfidfTrans.transform(cv.fit_transform(df_to_predict['indications_and_usage']))

col_8=tfidfTrans.fit_transform(cv.fit_transform(df['overdosage']))
col_8_predict=tfidfTrans.transform(cv.fit_transform(df_to_predict['overdosage']))

col_9=tfidfTrans.fit_transform(cv.fit_transform(df['spl_product_data_elements']))
col_9_predict=tfidfTrans.transform(cv.fit_transform(df_to_predict['spl_product_data_elements']))


In [23]:
df_1=pd.DataFrame(col_1.todense())
df_2=pd.DataFrame(col_2.todense())
df_3=pd.DataFrame(col_3.todense())
df_4=pd.DataFrame(col_4.todense())
df_5=pd.DataFrame(col_5.todense())
df_6=pd.DataFrame(col_6.todense())
df_7=pd.DataFrame(col_7.todense())
df_8=pd.DataFrame(col_8.todense())
df_9=pd.DataFrame(col_9.todense())

In [24]:
df_1_predict=pd.DataFrame(col_1_predict.todense())
df_2_predict=pd.DataFrame(col_2_predict.todense())
df_3_predict=pd.DataFrame(col_3_predict.todense())
df_4_predict=pd.DataFrame(col_4_predict.todense())
df_5_predict=pd.DataFrame(col_5_predict.todense())
df_6_predict=pd.DataFrame(col_6_predict.todense())
df_7_predict=pd.DataFrame(col_7_predict.todense())
df_8_predict=pd.DataFrame(col_8_predict.todense())
df_9_predict=pd.DataFrame(col_9_predict.todense())

In [25]:
tfidf_df=df_1.join(df_2,lsuffix='2_')

In [26]:
tfidf_df_predict=df_1_predict.join(df_2_predict,lsuffix='2_')

In [27]:
tfidf_df=tfidf_df.join(df_3,lsuffix='3_')
tfidf_df=tfidf_df.join(df_4,lsuffix='4_')
tfidf_df=tfidf_df.join(df_5,lsuffix='5_')
tfidf_df=tfidf_df.join(df_6,lsuffix='6_')
tfidf_df=tfidf_df.join(df_7,lsuffix='7_')
tfidf_df=tfidf_df.join(df_8,lsuffix='8_')
tfidf_df=tfidf_df.join(df_9,lsuffix='9_')

In [28]:
tfidf_df_predict=tfidf_df_predict.join(df_3_predict,lsuffix='3_')
tfidf_df_predict=tfidf_df_predict.join(df_4_predict,lsuffix='4_')
tfidf_df_predict=tfidf_df_predict.join(df_5_predict,lsuffix='5_')
tfidf_df_predict=tfidf_df_predict.join(df_6_predict,lsuffix='6_')
tfidf_df_predict=tfidf_df_predict.join(df_7_predict,lsuffix='7_')
tfidf_df_predict=tfidf_df_predict.join(df_8_predict,lsuffix='8_')
tfidf_df_predict=tfidf_df_predict.join(df_9_predict,lsuffix='9_')

In [29]:
tfidf_df.head()

Unnamed: 0,02_,12_,22_,32_,42_,52_,62_,72_,82_,92_,...,990,991,992,993,994,995,996,997,998,999
0,0.0,0.019856,0.0,0.0,0.022918,0.0,0.0,0.0,0.023243,0.023396,...,0.0,0.0,0.0,0.0,0.0,0.06364,0.0,0.0,0.0,0.0
1,0.0,0.075482,0.0,0.0,0.043561,0.083854,0.0,0.045409,0.044178,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.055289,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.012362,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
tsvd=TruncatedSVD(n_components=657)
tsvd_cols=tsvd.fit_transform(tfidf_df.values)

In [132]:
tsvd=TruncatedSVD(n_components=100)
tsvd.fit_transform(col_9)
exp_var=tsvd.explained_variance_ratio_
print np.cumsum(exp_var)

[ 0.01388051  0.04101972  0.06346547  0.08289745  0.09899737  0.1135032
  0.12646233  0.13904817  0.15106418  0.1621791   0.17269911  0.1830197
  0.1930949   0.20211602  0.21099519  0.21940747  0.22726027  0.23497688
  0.24259783  0.24977448  0.25677314  0.26362323  0.27031256  0.27673109
  0.28284354  0.28872895  0.29441557  0.30005902  0.3056312   0.31105516
  0.31625884  0.3213167   0.32626805  0.33103476  0.33574375  0.34039121
  0.34492223  0.34940907  0.35379704  0.35813524  0.36239569  0.36661132
  0.370695    0.37473085  0.37872625  0.38266465  0.3865304   0.39033914
  0.39404026  0.397732    0.40139201  0.40500791  0.40859868  0.412098
  0.41557696  0.41901803  0.42240363  0.42568574  0.42889609  0.4320922
  0.43527718  0.43838246  0.44145281  0.44449092  0.4474718   0.45043295
  0.45334079  0.45619726  0.45903318  0.46186255  0.46466024  0.46742858
  0.47017707  0.47287888  0.47555615  0.47820901  0.48081797  0.48337053
  0.48591531  0.48839517  0.4908489   0.49328084  0.4957

In [133]:
tsvd1_60=TruncatedSVD(n_components=7)
tsvd1_70=TruncatedSVD(n_components=25)

In [31]:
tsvd1=TruncatedSVD(n_components=100)
tsvd2=TruncatedSVD(n_components=100)
tsvd3=TruncatedSVD(n_components=100)
tsvd4=TruncatedSVD(n_components=100)
tsvd5=TruncatedSVD(n_components=100)
tsvd6=TruncatedSVD(n_components=100)
tsvd7=TruncatedSVD(n_components=100)
tsvd8=TruncatedSVD(n_components=100)
tsvd9=TruncatedSVD(n_components=100)

In [32]:
tsvd_col_1=tsvd1.fit_transform(col_1)
tsvd_col_2=tsvd2.fit_transform(col_2)
tsvd_col_3=tsvd3.fit_transform(col_3)
tsvd_col_4=tsvd4.fit_transform(col_4)
tsvd_col_5=tsvd5.fit_transform(col_5)
tsvd_col_6=tsvd6.fit_transform(col_6)
tsvd_col_7=tsvd7.fit_transform(col_7)
tsvd_col_8=tsvd8.fit_transform(col_8)
tsvd_col_9=tsvd9.fit_transform(col_9)

In [134]:
tsvd_col_1_60=tsvd1_60.fit_transform(col_1)
tsvd_col_1_70=tsvd1_70.fit_transform(col_1)

In [33]:
# print df_1.head()
# print df_1_predict.head()

In [34]:
tsvd_col_1_predict=tsvd1.transform(col_1_predict)

In [35]:
tsvd_col_2_predict=tsvd2.transform(col_2_predict)

In [135]:
tsvd_col_1_60_predict=tsvd1_60.transform(col_1_predict)
tsvd_col_1_70_predict=tsvd1_70.transform(col_1_predict)

In [36]:
type(col_2_predict)

scipy.sparse.csr.csr_matrix

In [37]:


tsvd_col_3_predict=tsvd3.transform(col_3_predict)
tsvd_col_4_predict=tsvd4.transform(col_4_predict)
tsvd_col_5_predict=tsvd5.transform(col_5_predict)
tsvd_col_6_predict=tsvd6.transform(col_6_predict)
tsvd_col_7_predict=tsvd7.transform(col_7_predict)
tsvd_col_8_predict=tsvd8.transform(col_8_predict)
tsvd_col_9_predict=tsvd9.transform(col_9_predict)

In [38]:
tsvd_cols_predict=tsvd.transform(tfidf_df_predict.values)

In [39]:
exp_var=tsvd.explained_variance_ratio_
print np.cumsum(exp_var)

[ 0.19791538  0.24728717  0.26753883  0.28286265  0.29489613  0.30494045
  0.31342184  0.32044392  0.32705252  0.33361219  0.33950157  0.34526281
  0.35083522  0.35639429  0.36155101  0.36653524  0.37132319  0.3759656
  0.38051623  0.3848128   0.38906135  0.39314469  0.39712804  0.40104353
  0.4049222   0.40866905  0.41233153  0.41584519  0.41922421  0.42253972
  0.42579926  0.42893992  0.43205019  0.43512371  0.4381303   0.44107238
  0.44397769  0.44682839  0.44964351  0.45235972  0.45504576  0.4576699
  0.46024243  0.46277527  0.46525028  0.46770379  0.47015284  0.47256797
  0.47496569  0.47730998  0.47961523  0.48188932  0.48415643  0.48638266
  0.48857768  0.49074014  0.49289194  0.49499695  0.49708337  0.49914281
  0.50116961  0.50316452  0.50515529  0.50712677  0.50907361  0.51098625
  0.5128777   0.51475186  0.51661987  0.51845533  0.52028001  0.52208931
  0.52388496  0.52566405  0.52739888  0.52912964  0.53083944  0.5325402
  0.53423215  0.53588603  0.5375085   0.53912032  0.54

In [40]:
tsvd_cols.shape

(40876, 657)

In [41]:
df.columns

Index([u'Unnamed: 0', u'adverse_reactions', u'clinical_pharmacology',
       u'contraindications', u'description', u'dosage_and_administration',
       u'how_supplied', u'indications_and_usage', u'overdosage',
       u'spl_product_data_elements', u'abuse', u'accessories', u'alarms',
       u'animal_pharmacology_and_or_toxicology', u'ask_doctor',
       u'ask_doctor_or_pharmacist', u'assembly_or_installation_instructions',
       u'carcinogenesis_and_mutagenesis_and_impairment_of_fertility',
       u'cleaning', u'clinical_studies', u'components',
       u'controlled_substance', u'dependence', u'diagram_of_device',
       u'disposal_and_waste_handling', u'do_not_use',
       u'dosage_forms_and_strengths', u'drug_abuse_and_dependence',
       u'drug_and_or_laboratory_test_interactions', u'drug_interactions',
       u'general_precautions', u'geriatric_use',
       u'guaranteed_analysis_of_feed', u'health_care_provider_letter',
       u'health_claim', u'information_for_owners_or_caregivers'

In [42]:
X=df[[u'abuse', u'accessories', u'alarms',
       u'animal_pharmacology_and_or_toxicology', u'ask_doctor',
       u'ask_doctor_or_pharmacist', u'assembly_or_installation_instructions',
       u'boxed_warning', u'calibration_instructions',
       u'carcinogenesis_and_mutagenesis_and_impairment_of_fertility',
       u'cleaning', u'clinical_studies', u'components',
       u'controlled_substance', u'dependence', u'diagram_of_device',
       u'disposal_and_waste_handling', u'do_not_use',
       u'dosage_forms_and_strengths', u'drug_abuse_and_dependence',
       u'drug_and_or_laboratory_test_interactions', u'drug_interactions',
       u'environmental_warning', u'food_safety_warning',
       u'general_precautions', u'geriatric_use',
       u'guaranteed_analysis_of_feed', u'health_care_provider_letter',
       u'health_claim', u'information_for_owners_or_caregivers',
       u'information_for_patients', u'instructions_for_use',
       u'intended_use_of_the_device', u'labor_and_delivery',
       u'laboratory_tests', u'mechanism_of_action', u'microbiology',
       u'nonclinical_toxicology', u'nonteratogenic_effects',
       u'nursing_mothers', u'other_safety_information',
       u'patient_medication_information', u'pediatric_use',
       u'pharmacodynamics', u'pharmacogenomics', u'pharmacokinetics',
       u'precautions', u'pregnancy', u'pregnancy_or_breast_feeding',
       u'questions', u'recent_major_changes', u'residue_warning', u'risks',
       u'route', u'safe_handling_warning', u'spl_indexing_data_elements',
       u'spl_medguide', u'spl_patient_package_insert',
       u'statement_of_identity', u'summary_of_safety_and_effectiveness',
       u'teratogenic_effects', u'troubleshooting',
       u'use_in_specific_populations', u'user_safety_warnings', u'version',
       u'veterinary_indications', u'warnings_and_cautions', u'when_using',
       u'date', u'month', u'year']]

In [43]:
X_predict=df_to_predict[[u'abuse', u'accessories', u'alarms',
       u'animal_pharmacology_and_or_toxicology', u'ask_doctor',
       u'ask_doctor_or_pharmacist', u'assembly_or_installation_instructions',
       u'boxed_warning', u'calibration_instructions',
       u'carcinogenesis_and_mutagenesis_and_impairment_of_fertility',
       u'cleaning', u'clinical_studies', u'components',
       u'controlled_substance', u'dependence', u'diagram_of_device',
       u'disposal_and_waste_handling', u'do_not_use',
       u'dosage_forms_and_strengths', u'drug_abuse_and_dependence',
       u'drug_and_or_laboratory_test_interactions', u'drug_interactions',
       u'environmental_warning', u'food_safety_warning',
       u'general_precautions', u'geriatric_use',
       u'guaranteed_analysis_of_feed', u'health_care_provider_letter',
       u'health_claim', u'information_for_owners_or_caregivers',
       u'information_for_patients', u'instructions_for_use',
       u'intended_use_of_the_device', u'labor_and_delivery',
       u'laboratory_tests', u'mechanism_of_action', u'microbiology',
       u'nonclinical_toxicology', u'nonteratogenic_effects',
       u'nursing_mothers', u'other_safety_information',
       u'patient_medication_information', u'pediatric_use',
       u'pharmacodynamics', u'pharmacogenomics', u'pharmacokinetics',
       u'precautions', u'pregnancy', u'pregnancy_or_breast_feeding',
       u'questions', u'recent_major_changes', u'residue_warning', u'risks',
       u'route', u'safe_handling_warning', u'spl_indexing_data_elements',
       u'spl_medguide', u'spl_patient_package_insert',
       u'statement_of_identity', u'summary_of_safety_and_effectiveness',
       u'teratogenic_effects', u'troubleshooting',
       u'use_in_specific_populations', u'user_safety_warnings', u'version',
       u'veterinary_indications', u'warnings_and_cautions', u'when_using',
       u'date', u'month', u'year']]

In [44]:
X_bool_only=df[[u'abuse', u'accessories', u'alarms',
       u'animal_pharmacology_and_or_toxicology', u'ask_doctor',
       u'ask_doctor_or_pharmacist', u'assembly_or_installation_instructions',
       u'boxed_warning', u'calibration_instructions',
       u'carcinogenesis_and_mutagenesis_and_impairment_of_fertility',
       u'cleaning', u'clinical_studies', u'components',
       u'controlled_substance', u'dependence', u'diagram_of_device',
       u'disposal_and_waste_handling', u'do_not_use',
       u'dosage_forms_and_strengths', u'drug_abuse_and_dependence',
       u'drug_and_or_laboratory_test_interactions', u'drug_interactions',
       u'environmental_warning', u'food_safety_warning',
       u'general_precautions', u'geriatric_use',
       u'guaranteed_analysis_of_feed', u'health_care_provider_letter',
       u'health_claim', u'information_for_owners_or_caregivers',
       u'information_for_patients', u'instructions_for_use',
       u'intended_use_of_the_device', u'labor_and_delivery',
       u'laboratory_tests', u'mechanism_of_action', u'microbiology',
       u'nonclinical_toxicology', u'nonteratogenic_effects',
       u'nursing_mothers', u'other_safety_information',
       u'patient_medication_information', u'pediatric_use',
       u'pharmacodynamics', u'pharmacogenomics', u'pharmacokinetics',
       u'precautions', u'pregnancy', u'pregnancy_or_breast_feeding',
       u'questions', u'recent_major_changes', u'residue_warning', u'risks',
       u'route', u'safe_handling_warning', u'spl_indexing_data_elements',
       u'spl_medguide', u'spl_patient_package_insert',
       u'statement_of_identity', u'summary_of_safety_and_effectiveness',
       u'teratogenic_effects', u'troubleshooting',
       u'use_in_specific_populations', u'user_safety_warnings', u'version',
       u'veterinary_indications', u'warnings_and_cautions', u'when_using']]

In [45]:
X_bool_only_predict=df_to_predict[[u'abuse', u'accessories', u'alarms',
       u'animal_pharmacology_and_or_toxicology', u'ask_doctor',
       u'ask_doctor_or_pharmacist', u'assembly_or_installation_instructions',
       u'boxed_warning', u'calibration_instructions',
       u'carcinogenesis_and_mutagenesis_and_impairment_of_fertility',
       u'cleaning', u'clinical_studies', u'components',
       u'controlled_substance', u'dependence', u'diagram_of_device',
       u'disposal_and_waste_handling', u'do_not_use',
       u'dosage_forms_and_strengths', u'drug_abuse_and_dependence',
       u'drug_and_or_laboratory_test_interactions', u'drug_interactions',
       u'environmental_warning', u'food_safety_warning',
       u'general_precautions', u'geriatric_use',
       u'guaranteed_analysis_of_feed', u'health_care_provider_letter',
       u'health_claim', u'information_for_owners_or_caregivers',
       u'information_for_patients', u'instructions_for_use',
       u'intended_use_of_the_device', u'labor_and_delivery',
       u'laboratory_tests', u'mechanism_of_action', u'microbiology',
       u'nonclinical_toxicology', u'nonteratogenic_effects',
       u'nursing_mothers', u'other_safety_information',
       u'patient_medication_information', u'pediatric_use',
       u'pharmacodynamics', u'pharmacogenomics', u'pharmacokinetics',
       u'precautions', u'pregnancy', u'pregnancy_or_breast_feeding',
       u'questions', u'recent_major_changes', u'residue_warning', u'risks',
       u'route', u'safe_handling_warning', u'spl_indexing_data_elements',
       u'spl_medguide', u'spl_patient_package_insert',
       u'statement_of_identity', u'summary_of_safety_and_effectiveness',
       u'teratogenic_effects', u'troubleshooting',
       u'use_in_specific_populations', u'user_safety_warnings', u'version',
       u'veterinary_indications', u'warnings_and_cautions', u'when_using']]

In [46]:
X_tsvd_1=tsvd_col_1
X_tsvd_2=tsvd_col_2
X_tsvd_3=tsvd_col_3
X_tsvd_4=tsvd_col_4
X_tsvd_5=tsvd_col_5
X_tsvd_6=tsvd_col_6
X_tsvd_7=tsvd_col_7
X_tsvd_8=tsvd_col_8
X_tsvd_9=tsvd_col_9

In [136]:
X_tsvd_1_60=tsvd_col_1_60
X_tsvd_1_70=tsvd_col_1_70

In [47]:
X_tsvd_1_plus_bool=pd.DataFrame(X_tsvd_1)
X_tsvd_1_plus_bool=X_tsvd_1_plus_bool.join(X_bool_only)

In [48]:
X_tsvd_9_plus_bool=pd.DataFrame(X_tsvd_9)
X_tsvd_9_plus_bool=X_tsvd_9_plus_bool.join(X_bool_only)

In [137]:
X_tsvd_1_60_plus_bool=pd.DataFrame(X_tsvd_1_60)
X_tsvd_1_60_plus_bool=X_tsvd_1_60_plus_bool.join(X_bool_only)

X_tsvd_1_70_plus_bool=pd.DataFrame(X_tsvd_1_70)
X_tsvd_1_70_plus_bool=X_tsvd_1_70_plus_bool.join(X_bool_only)

In [149]:
print len(X_tsvd_1_plus_bool)
print len(X_tsvd_1_70_plus_bool)

40876
40876


In [49]:
X_tsvd_1_predict=tsvd_col_1_predict
X_tsvd_2_predict=tsvd_col_2_predict
X_tsvd_3_predict=tsvd_col_3_predict
X_tsvd_4_predict=tsvd_col_4_predict
X_tsvd_5_predict=tsvd_col_5_predict
X_tsvd_6_predict=tsvd_col_6_predict
X_tsvd_7_predict=tsvd_col_7_predict
X_tsvd_8_predict=tsvd_col_8_predict
X_tsvd_9_predict=tsvd_col_9_predict

In [138]:
X_tsvd_1_60_predict=tsvd_col_1_60_predict
X_tsvd_1_70_predict=tsvd_col_1_70_predict

In [50]:
X_tsvd_1_plus_bool_predict=pd.DataFrame(X_tsvd_1_predict)
X_tsvd_1_plus_bool_predict=X_tsvd_1_plus_bool_predict.join(X_bool_only_predict)

In [139]:
X_tsvd_1_60_plus_bool_predict=pd.DataFrame(X_tsvd_1_60_predict)
X_tsvd_1_60_plus_bool_predict=X_tsvd_1_60_plus_bool_predict.join(X_bool_only_predict)

X_tsvd_1_70_plus_bool_predict=pd.DataFrame(X_tsvd_1_70_predict)
X_tsvd_1_70_plus_bool_predict=X_tsvd_1_70_plus_bool_predict.join(X_bool_only_predict)

In [51]:
X_tsvd_9_plus_bool_predict=pd.DataFrame(X_tsvd_9_predict)
X_tsvd_9_plus_bool_predict=X_tsvd_9_plus_bool_predict.join(X_bool_only_predict)

In [52]:
X_tsvd_1_and_9=pd.DataFrame(tsvd_col_1)
X_tsvd_1_and_9=X_tsvd_1_and_9.join(pd.DataFrame(tsvd_col_9),rsuffix='_9')

In [53]:
X_tsvd_1_and_9_predict=pd.DataFrame(tsvd_col_1_predict)
X_tsvd_1_and_9_predict=X_tsvd_1_and_9_predict.join(pd.DataFrame(tsvd_col_9_predict),rsuffix='_9')

In [54]:
X=X.join(pd.DataFrame(tsvd_cols))

In [55]:
X_nlp=tsvd_cols

In [56]:
X_predict=X_predict.join(pd.DataFrame(tsvd_cols_predict))

In [57]:
X_nlp_predict=tsvd_cols_predict

In [58]:
X.shape

(40876, 728)

In [59]:
y=df['target']

In [60]:
y_predict=df_to_predict['target']

In [61]:
# lr=LogisticRegression()

In [62]:
# lr.fit(X,y)

In [63]:
# lr.score(X,y)

In [64]:
# lr_bool=LogisticRegression()
# lr_bool.fit(X_bool_only,y)
# lr_bool.score(X_bool_only,y)

In [65]:
# lr.score(X_predict,y_predict)

In [66]:
# lr_bool.score(X_bool_only_predict,y_predict)

In [67]:
# prediction=lr.predict(X)

In [68]:
# prediction_predict=lr.predict(X_predict)

In [69]:
# prediction_bool=lr_bool.predict(X_bool_only)
# prediction_bool_predict=lr_bool.predict(X_bool_only_predict)
# print recall_score(y,prediction_bool)
# print recall_score(y_predict, prediction_bool_predict)
# print precision_score(y,prediction_bool)
# print precision_score(y_predict,prediction_bool_predict)

In [70]:
# print recall_score(y,prediction)

In [71]:
# print recall_score(y_predict,prediction_predict)

In [72]:
# print precision_score(y,prediction)

In [73]:
#print recall_score(y_predict,prediction_predict)

In [74]:
#y_predict.sum()

In [75]:
#prediction.shape

In [76]:
#prediction.sum()

In [77]:
21/43106.0

0.0004871711594673595

In [78]:
#random_guess=[]

In [79]:
# for num in range (len(prediction)):
#     random_guess.append(0)

In [80]:
# for num in range(prediction.sum()):
#     random_guess[num*100]=1

In [81]:
# random_recall=recall_score(y,random_guess)

In [82]:
# random_recall

In [83]:
df_final=X.join(y)
df_final=df_final.join(df['unq_ndc'])

In [84]:
df_final_predict=X_predict.join(y_predict)
df_final_predict=df_final_predict.join(df_to_predict['unq_ndc'])

In [85]:
df_final.to_csv('pca_v_2.csv')

In [86]:
df_final_predict.to_csv('pca_v_2_predicting_set.csv')

In [119]:
pd.DataFrame(X_tsvd_1).to_csv('col_1_train.csv')

In [120]:
pd.DataFrame(X_tsvd_1_predict).to_csv('col_1_test.csv')

In [151]:
len(X_tsvd_1_70_plus_bool)

40876

In [121]:
pd.DataFrame(X_tsvd_1_plus_bool).to_csv('col_1_bool_train.csv')

In [122]:
pd.DataFrame(X_tsvd_1_plus_bool_predict).to_csv('col_1_bool_test.csv')

In [145]:
pd.DataFrame(X_tsvd_1_70).to_csv('col_1_70_train.csv')
pd.DataFrame(X_tsvd_1_70_predict).to_csv('col_1_70_test.csv')

In [147]:
pd.DataFrame(X_tsvd_1_70_plus_bool).to_csv('col_1_70_plus_bool_train.csv')
pd.DataFrame(X_tsvd_1_70_plus_bool_predict).to_csv('col_1_70_plus_bool_test.csv')

In [118]:
len(X_tsvd_9_plus_bool)

40876

In [91]:
pd.DataFrame(X_tsvd_9).to_csv('col_9_train.csv')
pd.DataFrame(X_tsvd_9_predict).to_csv('col_9_test.csv')
pd.DataFrame(X_tsvd_9_plus_bool).to_csv('col_9_bool_train.csv')
pd.DataFrame(X_tsvd_9_plus_bool_predict).to_csv('col_9_bool_test.csv')

In [92]:
rfc=RandomForestClassifier(n_estimators=300,n_jobs=-1,class_weight='balanced')

In [146]:
rfc.fit(X_tsvd_1_70_plus_bool,y)
prediction= rfc.predict(X_tsvd_1_70_plus_bool)
prediction_predict=rfc.predict(X_tsvd_1_70_plus_bool_predict)
print recall_score(y,prediction)
print recall_score(y_predict,prediction_predict)
print precision_score(y,prediction)
print precision_score(y_predict,prediction_predict)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [141]:
print y.sum()
print prediction.sum()
print y_predict.sum()
print prediction_predict.sum()

132
253
150
38


In [95]:
cm=pd.DataFrame(confusion_matrix(y_predict,prediction_predict), columns=['Pred. 0','Pred. 1'],index=['Actual 0','Actual 1'])

In [96]:
print cm

          Pred. 0  Pred. 1
Actual 0    43757       45
Actual 1      147        3


In [97]:
print classification_report(y_predict,prediction_predict)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     43802
          1       0.06      0.02      0.03       150

avg / total       0.99      1.00      0.99     43952



In [98]:
findings_tsvd_1=df_to_predict.join(pd.DataFrame(prediction_predict,columns=['prediction']))
findings_tsvd_1[(findings_tsvd_1['prediction']==1)&(findings_tsvd_1['target']==1)]
#print findings_tsvd_1.head()

Unnamed: 0.1,Unnamed: 0,adverse_reactions,clinical_pharmacology,contraindications,description,dosage_and_administration,how_supplied,indications_and_usage,overdosage,spl_product_data_elements,...,version,veterinary_indications,warnings_and_cautions,when_using,date,month,year,unq_ndc,target,prediction
29779,69205,u advers reaction advers effect parenter admin...,u clinic pharmacolog magnesium import cofactor...,u contraind parenter administr drug contraind ...,u descript magnesium sulfat inject usp 50 ster...,u dosag administr dosag magnesium sulfat must ...,u suppli magnesium sulfat inject usp suppli si...,u indic usag magnesium sulfat inject usp suita...,u overdosag magnesium intox manifest sharp dro...,u magnesium sulfat magnesium sulfat heptahydr ...,...,4,0,0,0,10,11,2016,0409-2168,1,1
37593,78662,u advers reaction import advers clinic event c...,u clinic pharmacolog introduct follow parenter...,u contraind fosphenytoin sodium inject contrai...,u descript fosphenytoin sodium inject usp prod...,u dosag administr dose concentr infus rate fos...,u suppli fosphenytoin sodium inject usp suppli...,u indic usag fosphenytoin sodium inject indic ...,u overdosag nausea vomit lethargi tachycardia ...,u fosphenytoin fosphenytoin sodium fosphenytoi...,...,7,0,0,0,8,12,2015,63323-403,1,1
37594,78663,u advers reaction import advers clinic event c...,u clinic pharmacolog introduct follow parenter...,u contraind fosphenytoin sodium inject contrai...,u descript fosphenytoin sodium inject usp prod...,u dosag administr dose concentr infus rate fos...,u suppli fosphenytoin sodium inject usp suppli...,u indic usag fosphenytoin sodium inject indic ...,u overdosag nausea vomit lethargi tachycardia ...,u fosphenytoin fosphenytoin sodium fosphenytoi...,...,7,0,0,0,8,12,2015,63323-403,1,1


In [99]:
rfc.fit(X_tsvd_2,y)
prediction= rfc.predict(X_tsvd_2)
prediction_predict=rfc.predict(X_tsvd_2_predict)
print recall_score(y,prediction)
print recall_score(y_predict,prediction_predict)
print precision_score(y,prediction)
print precision_score(y_predict,prediction_predict)

0.636363636364
0.0
0.401913875598
0.0


  'precision', 'predicted', average, warn_for)


In [100]:
rfc.fit(X_tsvd_3,y)
prediction= rfc.predict(X_tsvd_3)
prediction_predict=rfc.predict(X_tsvd_3_predict)
print recall_score(y,prediction)
print recall_score(y_predict,prediction_predict)
print precision_score(y,prediction)
print precision_score(y_predict,prediction_predict)

0.636363636364
0.0
0.195804195804
0.0


In [101]:
rfc.fit(X_tsvd_4,y)
prediction= rfc.predict(X_tsvd_4)
prediction_predict=rfc.predict(X_tsvd_4_predict)
print recall_score(y,prediction)
print recall_score(y_predict,prediction_predict)
print precision_score(y,prediction)
print precision_score(y_predict,prediction_predict)

0.659090909091
0.0
0.769911504425
0.0


In [102]:
rfc.fit(X_tsvd_5,y)
prediction= rfc.predict(X_tsvd_5)
prediction_predict=rfc.predict(X_tsvd_5_predict)
print recall_score(y,prediction)
print recall_score(y_predict,prediction_predict)
print precision_score(y,prediction)
print precision_score(y_predict,prediction_predict)

1.0
0.0
0.0402561756633
0.0


In [103]:
rfc.fit(X_tsvd_6,y)
prediction= rfc.predict(X_tsvd_6)
prediction_predict=rfc.predict(X_tsvd_6_predict)
print recall_score(y,prediction)
print recall_score(y_predict,prediction_predict)
print precision_score(y,prediction)
print precision_score(y_predict,prediction_predict)

0.659090909091
0.0
0.861386138614
0.0


In [104]:
rfc.fit(X_tsvd_7,y)
prediction= rfc.predict(X_tsvd_7)
prediction_predict=rfc.predict(X_tsvd_7_predict)
print recall_score(y,prediction)
print recall_score(y_predict,prediction_predict)
print precision_score(y,prediction)
print precision_score(y_predict,prediction_predict)

0.992424242424
0.0
0.0426571149463
0.0


In [105]:
rfc.fit(X_tsvd_8,y)
prediction= rfc.predict(X_tsvd_8)
prediction_predict=rfc.predict(X_tsvd_8_predict)
print recall_score(y,prediction)
print recall_score(y_predict,prediction_predict)
print precision_score(y,prediction)
print precision_score(y_predict,prediction_predict)

0.545454545455
0.0
0.184143222506
0.0


In [106]:
rfc.fit(X_tsvd_9,y)
prediction= rfc.predict(X_tsvd_9)
prediction_predict=rfc.predict(X_tsvd_9_predict)
print recall_score(y,prediction)
print recall_score(y_predict,prediction_predict)
print precision_score(y,prediction)
print precision_score(y_predict,prediction_predict)

1.0
0.00666666666667
0.317307692308
0.0120481927711


In [107]:
print y.sum()
print prediction.sum()
print y_predict.sum()
print prediction_predict.sum()

132
416
150
83


In [108]:
rfc.fit(X_tsvd_1_and_9,y)
prediction= rfc.predict(X_tsvd_1_and_9)
prediction_predict=rfc.predict(X_tsvd_1_and_9_predict)
print recall_score(y,prediction)
print recall_score(y_predict,prediction_predict)
print precision_score(y,prediction)
print precision_score(y_predict,prediction_predict)

1.0
0.0
0.543209876543
0.0


In [109]:
print y.sum()
print prediction.sum()
print y_predict.sum()
print prediction_predict.sum()

132
243
150
0


In [110]:
rfc.fit(X_tsvd_1_plus_bool,y)
prediction= rfc.predict(X_tsvd_1_plus_bool)
prediction_predict=rfc.predict(X_tsvd_1_plus_bool_predict)
print recall_score(y,prediction)
print recall_score(y_predict,prediction_predict)
print precision_score(y,prediction)
print precision_score(y_predict,prediction_predict)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
print y.sum()
print prediction.sum()
print y_predict.sum()
print prediction_predict.sum()

In [None]:
cm=confusion_matrix(y_predict,prediction_predict)

In [None]:
print cm

In [None]:
print classification_report(y_predict,prediction_predict)

In [None]:
findings_tsvd_bool=df_to_predict.join(pd.DataFrame(prediction_predict,columns=['prediction']))
findings_tsvd_bool[(findings_tsvd_bool['prediction']==1)&(findings_tsvd_1['target']==1)]
#print findings_tsvd_1.head()

In [None]:
rfc.fit(X_tsvd_9_plus_bool,y)
prediction= rfc.predict(X_tsvd_9_plus_bool)
prediction_predict=rfc.predict(X_tsvd_9_plus_bool_predict)
print recall_score(y,prediction)
print recall_score(y_predict,prediction_predict)
print precision_score(y,prediction)
print precision_score(y_predict,prediction_predict)

In [None]:
print y.sum()
print prediction.sum()
print y_predict.sum()
print prediction_predict.sum()

In [None]:
findings_tsvd_1

In [None]:
4/42.0

In [None]:
12/2924.0