In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
import csv

In [2]:
# Sentiment label encoding: 0 = positive, 1 = negative, 2 = neutral.
df = pd.read_csv("inno/train.csv")
# Model inputs are the free-text post and the drug it refers to; the target is the label column.
x, y = df[["text", "drug"]], df["sentiment"]

In [3]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [4]:
# Descriptive aliases for the split; they refer to the same objects as x_* / y_*.
training_phrase, training_label = x_train, y_train
testing_phrase, testing_label = x_test, y_test

In [5]:
# Unigram TF-IDF that keeps terms seen in at least 2 documents and in at most
# half of the documents.
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 1))

# fit_transform learns the vocabulary from the training text and encodes it;
# the held-out text is encoded with that same, already-learned vocabulary.
features = tfidf.fit_transform(np.asarray(training_phrase["text"]))
features_test = tfidf.transform(np.asarray(testing_phrase["text"]))

# NOTE: the drug column is encoded with the vocabulary learned from the text
# column (the vectorizer is never fitted on drug names), so these matrices have
# the same number of columns as the text features.
features1 = tfidf.transform(np.asarray(training_phrase["drug"]))
features_test1 = tfidf.transform(np.asarray(testing_phrase["drug"]))

In [11]:
# Dense view of the training TF-IDF matrix with one column per vocabulary term.
# NOTE: this materialises the full rows x vocabulary matrix in memory.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older installations.
if hasattr(tfidf, "get_feature_names_out"):
    term_names = tfidf.get_feature_names_out()
else:
    term_names = tfidf.get_feature_names()
# toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
df_tfidf = pd.DataFrame(features.toarray(), columns=term_names)

In [6]:
# Place the text and drug TF-IDF columns side by side, then fit a
# default-configured logistic regression on the combined matrix.
fin_features = hstack((features, features1))
regressor = LogisticRegression()
regressor.fit(fin_features, np.array(training_label))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [7]:
# Held-out features use the same [text | drug] column layout as training.
fin_feat_test = hstack((features_test, features_test1))
# Mean accuracy of the classifier on the held-out split.
regressor.score(fin_feat_test, testing_label)

0.7309236947791165

In [8]:
# Predicted labels for the held-out rows, to see which classes the model actually emits.
regressor.predict(fin_feat_test)

array([2, 2, 2, ..., 2, 2, 2])

In [10]:
# Class distribution of the held-out labels (baseline for judging the accuracy score).
testing_label.value_counts()

2    1262
1     291
0     190
Name: sentiment, dtype: int64

In [11]:
# Class distribution of the labels in the full training file.
y.value_counts()

2    3825
1     837
0     617
Name: sentiment, dtype: int64

In [32]:
# Inspect shape and sparsity of the training text TF-IDF matrix.
features

<3536x17765 sparse matrix of type '<class 'numpy.float64'>'
	with 472133 stored elements in Compressed Sparse Row format>

In [33]:
# Inspect shape and sparsity of the training drug-name TF-IDF matrix.
features1

<3536x17765 sparse matrix of type '<class 'numpy.float64'>'
	with 3558 stored elements in Compressed Sparse Row format>

In [34]:

# Column-wise concatenation: the row count is unchanged and the column counts add up.
hstack([features, features1])

<3536x35530 sparse matrix of type '<class 'numpy.float64'>'
	with 475691 stored elements in COOrdinate format>

In [28]:
# Element-wise sum of the two matrices (an alternative to hstack that was tried).
# NOTE(review): the recorded ValueError appears to come from an earlier state of the
# notebook; the shapes displayed for features and features1 are identical — confirm
# on a fresh run, and drop this cell if it is no longer needed.
features+features1

ValueError: inconsistent shapes

In [12]:
# Load the unlabeled test file that the submission is generated from.
df1 = pd.read_csv("inno/test.csv")

In [42]:
# Preview the first rows of the test file.
df1.head()

Unnamed: 0,unique_hash,text,drug
0,9e9a8166b84114aca147bf409f6f956635034c08,"256 (previously stable on natalizumab), with 5...",fingolimod
1,e747e6822c867571afe7b907b51f0f2ca67b0e1a,On fingolimod and have been since December 201...,fingolimod
2,50b6d851bcff4f35afe354937949e9948975adf7,Apparently it's shingles! :-/ I do have a few ...,humira
3,7f82ec2176ae6ab0b5d20b5ffc767ac829f384ae,If the Docetaxel doing once a week x3 weeks th...,tagrisso
4,8b37d169dee5bdae27060949242fb54feb6a7f7f,"CC, Stelara worked in a matter of days for me....",stelara


In [13]:
# Display the test frame.
# NOTE(review): this renders the whole DataFrame; df1.head() already gives a preview.
df1

Unnamed: 0,unique_hash,text,drug
0,9e9a8166b84114aca147bf409f6f956635034c08,"256 (previously stable on natalizumab), with 5...",fingolimod
1,e747e6822c867571afe7b907b51f0f2ca67b0e1a,On fingolimod and have been since December 201...,fingolimod
2,50b6d851bcff4f35afe354937949e9948975adf7,Apparently it's shingles! :-/ I do have a few ...,humira
3,7f82ec2176ae6ab0b5d20b5ffc767ac829f384ae,If the Docetaxel doing once a week x3 weeks th...,tagrisso
4,8b37d169dee5bdae27060949242fb54feb6a7f7f,"CC, Stelara worked in a matter of days for me....",stelara
5,b1950d27d94ceff4e9bf8c7d1fd4b11b35ede4d7,"Janssen Biotech, Inc. has just received FDA ap...",stelara
6,abafc5b6c5aac6f777cf265e5c7dd80fb793e6bc,"I just had the, ” I thought things would be be...",ocrevus
7,e5550693e72a8335d723ca5fc64da91e1256fb0b,Dec.26 2018 (Basha Fowler) I was diagnosed in ...,tagrisso
8,ee8c500f6402331ff12b0b29d943b6d1699a0b8d,"Hi, I started Gilenya about 7 weeks ago and ha...",gilenya
9,d261600ba4fc022fac12748845deed56822ff195,My uncle is still going through treatment. 2 k...,keytruda


In [77]:
# Build the submission rows: a header followed by [unique_hash, predicted sentiment]
# for every row of the test file.
#
# Every row is scored on its OWN features. The previous version built per-row
# features but then predicted on `testxc`/`testyc` (one fixed row prepared in a
# different cell), so each hash received that single row's label, and the cell
# raised NameError when the notebook was run top to bottom on a fresh kernel.
test_text_features = tfidf.transform(df1["text"])
test_drug_features = tfidf.transform(df1["drug"])
# Same [text | drug] column layout the classifier was fitted on; one batched
# predict call replaces two predict calls per row.
test_predictions = regressor.predict(hstack([test_text_features, test_drug_features]))

# Report the index of every row that was not predicted as label 2.
for index in df1.index[test_predictions != 2]:
    print("hello", index)

fin_data = [["unique_hash", "sentiment"]]
fin_data.extend(
    [unique_hash, sentiment]
    for unique_hash, sentiment in zip(df1["unique_hash"], test_predictions)
)

In [73]:
# Spot check: encode the text and drug of the test row at position 41, each as
# a one-document batch, with the already-fitted vectorizer.
testxc, testyc = [
    tfidf.transform([df1.iloc[41][column]]) for column in ("text", "drug")
]

In [74]:
# Predicted label for the single spot-check row ([0] unwraps the one-element result).
regressor.predict(hstack([testxc, testyc]))[0]

2

In [76]:
# Inspect the rows that will be written to the submission file (header first).
fin_data

[['unique_hash', 'sentiment'],
 ['9e9a8166b84114aca147bf409f6f956635034c08', 2],
 ['e747e6822c867571afe7b907b51f0f2ca67b0e1a', 2],
 ['50b6d851bcff4f35afe354937949e9948975adf7', 2],
 ['7f82ec2176ae6ab0b5d20b5ffc767ac829f384ae', 2],
 ['8b37d169dee5bdae27060949242fb54feb6a7f7f', 2],
 ['b1950d27d94ceff4e9bf8c7d1fd4b11b35ede4d7', 2],
 ['abafc5b6c5aac6f777cf265e5c7dd80fb793e6bc', 2],
 ['e5550693e72a8335d723ca5fc64da91e1256fb0b', 2],
 ['ee8c500f6402331ff12b0b29d943b6d1699a0b8d', 2],
 ['d261600ba4fc022fac12748845deed56822ff195', 2],
 ['ed1836e3e3597568a9b3d58dcfe8ed7143bba4bd', 2],
 ['4b63c92b430a34abdc659a2ceee6fb353a759502', 2],
 ['c046fc70caab19d74271855afa4b0fce56eed65d', 2],
 ['7e6cd0d963891627f809b8b3430817dc08a4671e', 2],
 ['c1db106d5edc8db4a139acd3532e137903b88dfe', 2],
 ['58bb0752dd7b4835195f27c975bf4ec27dbf5d06', 2],
 ['17158941d16c145b692b65acdc0098d7cde98fd1', 2],
 ['245536722a649761b30a740031040a937d8b3e0e', 2],
 ['a62ee194c75afeb7527adc2ac3fc1bf4861b818d', 2],
 ['3b792ab6f0d8cbd6

In [78]:
# Write the header and the [unique_hash, sentiment] rows to the submission file.
# newline='' hands line-ending control to the csv module, which otherwise emits
# blank rows between records on Windows. The with-block closes the file on exit,
# so no explicit close() call is needed.
with open('inno/submission.csv', 'w', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerows(fin_data)