In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Lookup tables filled by the training loops further down:
#   val_dict / quest_val_dict     -> key whose training labels had a single distinct value
#   model_dict / quest_model_dict -> key with a fitted SVC
val_dict, model_dict = {}, {}
quest_val_dict, quest_model_dict = {}, {}

# Question table: tab-separated, no header row, seven columns.
QUESTION_COLUMNS = ["Q_id","Q_tag","Word_seq","Char_seq","Likes","Ans","Top_Ans"]
question_data = pd.read_csv('question_info.txt', sep="\t", header=None)
question_data.columns = QUESTION_COLUMNS
# The word-sequence column is dropped right away; nothing below reads it.
question_data = question_data.drop("Word_seq", axis=1)

In [2]:
from __future__ import print_function

from sklearn import datasets
from sklearn.cross_validation  import train_test_split
from sklearn.grid_search  import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

def get_Best_Param(X, y):
    """Grid-search SVC hyper-parameters and report the best combination.

    The search is fitted on a stratified 80% split of ``(X, y)`` (fixed
    ``random_state=0``); the remaining 20% is split off but not used here.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` / ``GridSearchCV``.
    y : class labels, one per row of ``X``; also used for stratification.

    Returns
    -------
    dict
        ``clf.best_params_`` of the fitted grid search. The same dict is
        printed, as before, so existing callers that ignore the return value
        see no change.
    """
    # Only the training part of the split is needed for the search.
    X_train, _X_test, y_train, _y_test = train_test_split(
        X, y, test_size=0.2, random_state=0, stratify=y)

    # Candidate grids: an RBF kernel over gamma x C, and a linear kernel over C.
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                         'C': [1, 10, 100, 1000]},
                        {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

    clf = GridSearchCV(SVC(C=1), tuned_parameters)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print(clf.best_params_)
    # Hand the result back so callers can actually build a model from it.
    return clf.best_params_
    

In [3]:
# Bag-of-tokens encoder: every "/"-separated piece of a cell is one token.
cv = CountVectorizer(tokenizer=lambda text: text.split("/"))

# fit_transform refits cv on each call, so the two matrices have independent vocabularies.
question_tag_text = question_data["Q_tag"].astype(str)
question_char_text = question_data["Char_seq"].astype(str)
q_tag_vector = cv.fit_transform(question_tag_text)
char_seq_vector = cv.fit_transform(question_char_text)

# The raw text columns are no longer needed once they are vectorised.
question_data = question_data.drop(["Q_tag", "Char_seq"], axis=1)

In [4]:

# Synthetic column names, one per vocabulary entry of each count matrix.
char_col_list = ["char_%d" % num for num in range(char_seq_vector.shape[1])]
tag_col_list = ["tag_%d" % num for num in range(q_tag_vector.shape[1])]

# Append the dense tag counts as extra columns of the question table.
question_tag_frame = pd.DataFrame(q_tag_vector.toarray(), columns=tag_col_list)
question_data = pd.concat([question_data, question_tag_frame], axis=1)
# The character-sequence counts are not appended to question_data; that step is disabled:
#question_data =  pd.concat([question_data,pd.DataFrame(char_seq_vector.toarray(), columns=char_col_list)],axis=1)

In [5]:
# Expert table: tab-separated, no header row, four columns.
EXPERT_COLUMNS = ["U_id","U_tag","Word_seq","Char_seq"]
expert_data = pd.read_csv('user_info.txt', sep="\t", header=None)
expert_data.columns = EXPERT_COLUMNS

# Reuse the "/"-tokenising vectoriser; each fit_transform call refits it from scratch.
expert_tag_text = expert_data["U_tag"].astype(str)
expert_char_text = expert_data["Char_seq"].astype(str)
U_tag_vector = cv.fit_transform(expert_tag_text)
char_seq_vector = cv.fit_transform(expert_char_text)

# Word_seq is never used; the tag and char text now live in the count matrices.
expert_data = expert_data.drop(["Word_seq", "U_tag", "Char_seq"], axis=1)


In [6]:
# Synthetic column names, one per vocabulary entry of each expert count matrix.
tag_col_list = ["U_tag_%d" % num for num in range(U_tag_vector.shape[1])]
char_col_list = ["char_%d" % num for num in range(char_seq_vector.shape[1])]

# Append the dense tag counts, then the dense char counts, as extra columns.
expert_data = pd.concat(
    [
        expert_data,
        pd.DataFrame(U_tag_vector.toarray(), columns=tag_col_list),
        pd.DataFrame(char_seq_vector.toarray(), columns=char_col_list),
    ],
    axis=1,
)

In [7]:
# Training invitations: tab-separated, no header row; one (question, expert, label) per line.
INVITATION_COLUMNS = ["Q_id","U_id","output"]
expert_question_dist = pd.read_csv(
    'invited_info_train.txt',
    sep="\t",
    header=None,
)
expert_question_dist.columns = INVITATION_COLUMNS

In [9]:
# Train one classifier per expert, using the features of the questions that
# expert appears with in the training invitations.
for expert, per_user_dist in expert_question_dist.groupby(["U_id"]):
    final_data_user = pd.merge(question_data, per_user_dist, on='Q_id', how='inner')
    final_target_user = final_data_user["output"]
    # Identifiers and the label itself must not be fed to the model as features.
    final_data_user = final_data_user.drop(["Q_id", "U_id", "output"], axis=1)

    distinct_labels = final_target_user.unique()
    if len(distinct_labels) == 1:
        # Only one label was ever observed for this expert: remember it instead of fitting.
        val_dict[expert] = distinct_labels
        continue

    # Hyper-parameter search via get_Best_Param is not run here; SVC defaults are used.
    model = SVC(probability=True)
    model.fit(final_data_user, final_target_user)
    model_dict[expert] = model

In [43]:
# Train one classifier per question, using the features of the experts that
# appear with that question in the training invitations.
for question, per_user_dist in expert_question_dist.groupby(["Q_id"]):
    final_data_quest = pd.merge(expert_data, per_user_dist, on='U_id', how='inner')
    final_target_quest = final_data_quest["output"]
    # Identifiers and the label itself must not be fed to the model as features.
    final_data_quest = final_data_quest.drop(["Q_id", "U_id", "output"], axis=1)

    distinct_labels = final_target_quest.unique()
    if len(distinct_labels) == 1:
        # Only one label was ever observed for this question: remember it instead of fitting.
        quest_val_dict[question] = distinct_labels
        continue

    # Hyper-parameter search via get_Best_Param is not run here; SVC defaults are used.
    model = SVC(probability=True)
    model.fit(final_data_quest, final_target_quest)
    quest_model_dict[question] = model

In [10]:
# Validation pairs: comma-separated; the first line is consumed as the header
# (read_csv default), so the resulting column labels are strings.
# Only the first two columns are kept. They are selected by POSITION with
# .iloc: `frame[[0, 1]]` is a label lookup and raises KeyError on current
# pandas because no column is labelled 0 or 1.
validation_data = pd.read_csv('validate_nolabel.txt', sep=",").iloc[:, [0, 1]].copy()
validation_data.columns = ["Q_id","U_id"]

In [12]:
# Score every validation (question, expert) pair and write the result to val.csv.
#
# Per expert, the score is taken from the first source that applies:
#   1. val_dict   - all training labels of the expert were one value: use that value;
#   2. model_dict - a fitted SVC exists: use column 1 of predict_proba;
#   3. otherwise  - 0.0 (the expert has neither a stored label nor a model).
# The question-level tables (quest_val_dict / quest_model_dict) are not consulted.
#
# All groups are collected and written with a single overwriting call, so
# re-running this cell no longer appends duplicate rows to an existing val.csv
# and the file is opened only once.
scored_groups = []
for expert, per_user_dist in validation_data.groupby(["U_id"]):
    # Work on a re-indexed copy rather than mutating the groupby slice in place.
    per_user_dist = per_user_dist.reset_index(drop=True)
    n_rows = per_user_dist.shape[0]

    if expert in val_dict:
        scores = np.full(n_rows, val_dict[expert][0], dtype=float)
    elif expert in model_dict:
        # Rows whose question is missing from question_data have no features and
        # keep 0.0. Predictions are written back through the mask so each one
        # stays attached to its own row even when the merge drops rows.
        # (If question_data held duplicate Q_ids the merge would return extra
        # rows and the masked assignment below would raise instead of misaligning.)
        scores = np.zeros(n_rows)
        has_features = per_user_dist["Q_id"].isin(question_data["Q_id"]).values
        if has_features.any():
            predicted_data = pd.merge(per_user_dist[has_features], question_data,
                                      on='Q_id', how='inner')
            predicted_data = predicted_data.drop(["Q_id", "U_id"], axis=1)
            scores[has_features] = model_dict[expert].predict_proba(predicted_data)[:, 1]
    else:
        scores = np.zeros(n_rows)

    final_predicted_data = per_user_dist.copy()
    final_predicted_data["output"] = scores
    scored_groups.append(final_predicted_data)

# Nothing is written when the validation set is empty.
if scored_groups:
    pd.concat(scored_groups, ignore_index=True).to_csv(
        'val.csv', mode='w', sep=',', index=False, header=False)
                

IOError: [Errno 13] Permission denied: 'val.csv'