In [2]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from __future__ import print_function
from sklearn import datasets
from sklearn.cross_validation  import train_test_split
from sklearn.grid_search  import GridSearchCV
from sklearn.metrics import classification_report

def readData(filename,columns):
    """Load a headerless, tab-separated file and label its columns.

    Parameters
    ----------
    filename : path or file-like object accepted by ``pandas.read_csv``.
    columns : sequence of labels assigned, in order, as the column index.

    Returns
    -------
    pandas.DataFrame
        The parsed table with ``columns`` as its column labels.
    """
    # header=None: the first line of the file is data, not column names.
    frame = pd.read_csv(filename, header=None, sep="\t")
    frame.columns = columns
    return frame

def expandCol(data,column_name):
    """Turn a "/"-delimited text column into a token-count matrix.

    Each cell of ``data[column_name]`` is cast to ``str`` and split on "/";
    the resulting tokens are counted by a ``CountVectorizer`` (all of its
    other settings are left at their defaults).

    Parameters
    ----------
    data : pandas.DataFrame containing ``column_name``.
    column_name : label of the column to expand.

    Returns
    -------
    The matrix returned by ``CountVectorizer.fit_transform`` -- one row per
    row of ``data``, one column per distinct token.
    """
    def split_on_slash(text):
        return text.split("/")

    # astype(str) so non-string cells (e.g. NaN, numbers) can still be split.
    as_text = data[column_name].astype(str)
    vectorizer = CountVectorizer(tokenizer=split_on_slash)
    return vectorizer.fit_transform(as_text)

def appendExpandedCol(data , vector_to_expand):
    """Append every expanded feature matrix to ``data`` as new columns.

    For each ``key -> matrix`` entry, the matrix is densified and added as
    columns named ``key0, key1, ...`` (one per matrix column).

    Parameters
    ----------
    data : pandas.DataFrame to extend; it is not modified in place.
    vector_to_expand : dict mapping a column-name prefix to a matrix that
        exposes ``.shape`` and ``.toarray()`` and has one row per row of
        ``data``.

    Returns
    -------
    pandas.DataFrame
        ``data`` followed by the expanded columns of every entry, in the
        dict's iteration order. ``data`` itself is returned when the dict
        is empty.
    """
    frames = [data]
    for key, matrix in vector_to_expand.items():
        col_list = [key + str(num) for num in range(matrix.shape[1])]
        # Reuse data's index so the new columns line up row-for-row even when
        # data does not carry a default 0..n-1 index.
        frames.append(
            pd.DataFrame(matrix.toarray(), columns=col_list, index=data.index))
    if len(frames) == 1:
        # Nothing to append.
        return data
    # Concatenate once, covering *all* entries (not just the last one).
    return pd.concat(frames, axis=1)

def getQuestData():
    """Build the question feature table from 'question_info.txt'.

    The file is read with ``readData``; the "Q_tag" and "Q_Char_seq" columns
    are expanded with ``expandCol`` and re-attached through
    ``appendExpandedCol``, while the raw "Word_seq", "Q_tag" and
    "Q_Char_seq" columns are dropped.

    Returns
    -------
    pandas.DataFrame
        Whatever ``appendExpandedCol`` returns for the remaining columns
        ("Q_id", "Likes", "Ans", "Top_Ans") and the expanded matrices.
    """
    raw = readData(
        'question_info.txt',
        ["Q_id","Q_tag","Word_seq","Q_Char_seq","Likes","Ans","Top_Ans"],
    )
    # Expand from the raw frame first, then discard the source columns.
    expanded = {}
    for name in ("Q_tag", "Q_Char_seq"):
        expanded[name] = expandCol(raw, name)
    remaining = raw.drop(["Word_seq", "Q_tag", "Q_Char_seq"], axis=1)
    return appendExpandedCol(remaining, expanded)

def getExpData():
    """Build the expert (user) feature table from 'user_info.txt'.

    The file is read with ``readData``; the "U_tag" and "U_Char_seq" columns
    are expanded with ``expandCol`` and re-attached through
    ``appendExpandedCol``, while the raw "U_Word_seq", "U_tag" and
    "U_Char_seq" columns are dropped.

    Returns
    -------
    pandas.DataFrame
        Whatever ``appendExpandedCol`` returns for the remaining "U_id"
        column and the expanded matrices.
    """
    source_columns = ["U_id","U_tag","U_Word_seq","U_Char_seq"]
    users = readData('user_info.txt', source_columns)
    # Expand from the raw frame first, then discard the source columns.
    tag_matrix = expandCol(users, "U_tag")
    char_matrix = expandCol(users, "U_Char_seq")
    expanded = {}
    expanded["U_tag"] = tag_matrix
    expanded["U_Char_seq"] = char_matrix
    id_only = users.drop(["U_Word_seq", "U_tag", "U_Char_seq"], axis=1)
    return appendExpandedCol(id_only, expanded)

def SVCModel(data,combine_col,mergecol,expert_question_dist):
    """Fit one probabilistic SVC per distinct value of ``combine_col``.

    ``expert_question_dist`` is split by ``combine_col``; each group is
    inner-joined with ``data`` on ``mergecol`` and the "output" column is
    used as the target. The ``combine_col`` and ``mergecol`` columns are
    removed before fitting, so every other column of ``data`` is a feature.

    Parameters
    ----------
    data : pandas.DataFrame of features, keyed by ``mergecol``.
    combine_col : column of ``expert_question_dist`` to group by.
    mergecol : column shared by ``data`` and ``expert_question_dist``.
    expert_question_dist : pandas.DataFrame with ``combine_col``,
        ``mergecol`` and "output" columns.

    Returns
    -------
    (model_dict, val_dict)
        ``model_dict`` maps a group key to its fitted ``SVC``.
        ``val_dict`` maps a group key to the array of its single distinct
        "output" value, for groups where no classifier can be trained.
        Groups with no matching rows in ``data`` appear in neither dict.
    """
    val_dict = {}
    model_dict = {}
    # Group by the bare column name rather than a one-element list: newer
    # pandas returns tuple keys for list groupers, which would not match the
    # scalar keys SVCPredict uses for its lookups.
    for group_id, per_dist in expert_question_dist.groupby(combine_col):
        final_data = pd.merge(data, per_dist, on=mergecol, how='inner')
        #final_data = pd.merge(final_data, expert_data, on='U_id', how='inner')
        if final_data.empty:
            # No feature rows for this group: nothing to learn or memorise.
            continue
        final_target_user = final_data["output"]
        distinct_outputs = final_target_user.unique()
        if len(distinct_outputs) == 1:
            # A classifier needs at least two classes; remember the constant.
            val_dict[group_id] = distinct_outputs
            continue
        features = final_data.drop([combine_col, mergecol, "output"], axis=1)
        model = SVC(probability=True)
        model.fit(features, final_target_user)
        model_dict[group_id] = model
    return model_dict,val_dict

   
# Feature tables for questions and experts.
question_data = getQuestData()
expert_data = getExpData()

# Training rows: one (question, expert, output) record per line.
expert_question_dist = readData('invited_info_train.txt', ["Q_id","U_id","output"])

# One model per expert (question features) and one per question (expert features).
e_model_dict,e_val_dict = SVCModel(question_data,"U_id","Q_id",expert_question_dist)
q_model_dict,q_val_dict = SVCModel(expert_data,"Q_id","U_id",expert_question_dist)

# Keep the first two columns by position: the file's own header row supplies
# the labels, so selecting with integer labels ([[0, 1]]) is not reliable.
validation_data = pd.read_csv('validate_nolabel.txt', sep=",").iloc[:, [0, 1]]
validation_data.columns = ["Q_id","U_id"]


In [2]:
def get_Best_Param(X, y):
    """Grid-search SVC hyper-parameters on a stratified 80% split of (X, y).

    The data is split 80/20 (``random_state=0``, stratified on ``y``) and the
    search is fitted on the 80% portion only. Two grids are tried: an RBF
    kernel over ``gamma`` and ``C``, and a linear kernel over ``C``.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` / ``SVC``.
    y : target labels, also used for stratification.

    Returns
    -------
    dict
        ``best_params_`` of the fitted ``GridSearchCV``; the same value is
        printed.
    """
    # The held-out 20% is not used inside this function, so it is discarded.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=0, stratify=y)

    tuned_parameters = [
        {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
        {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
    ]

    clf = GridSearchCV(SVC(C=1), tuned_parameters)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print(clf.best_params_)
    # Hand the result back so callers can use it, not just read it.
    return clf.best_params_
    

In [4]:
def SVCPredict(combinecol,mergecol,data,validation_data,val_dict,model_dict):
    """Score every row of ``validation_data`` with the per-group models.

    Rows are grouped by ``combinecol``. For each group key:

    * if the key is in ``val_dict``, every row gets ``val_dict[key][0]``;
    * else if the key is in ``model_dict``, rows are joined with ``data`` on
      ``mergecol`` and scored with column 1 of ``predict_proba``; rows whose
      ``mergecol`` value is absent from ``data`` score 0;
    * otherwise every row scores 0.

    Parameters
    ----------
    combinecol : column of ``validation_data`` to group by.
    mergecol : column shared by ``validation_data`` and ``data``.
    data : pandas.DataFrame of features keyed by ``mergecol``; presumably
        holds one row per key (duplicate keys make the score assignment
        raise rather than misalign).
    validation_data : pandas.DataFrame of rows to score.
    val_dict : mapping of group key to an indexable holding a constant score.
    model_dict : mapping of group key to an object with ``predict_proba``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``validation_data`` in group order, with a float
        "output" column appended; an empty frame when there are no groups.
    """
    scored_groups = []
    for group_id, per_dist in validation_data.groupby(combinecol):
        per_dist = per_dist.reset_index(drop=True)
        # Default score of 0 covers unknown groups and unmatched rows.
        scores = np.zeros(len(per_dist), dtype=float)
        if group_id in val_dict:
            scores.fill(val_dict[group_id][0])
        elif group_id in model_dict:
            # Only rows with features can be scored; track which ones they
            # are so each probability lands on the row it was computed for.
            has_features = per_dist[mergecol].isin(data[mergecol])
            if has_features.any():
                features = pd.merge(
                    per_dist[has_features], data, on=mergecol, how='inner')
                features = features.drop([combinecol, mergecol], axis=1)
                probabilities = np.asarray(
                    model_dict[group_id].predict_proba(features))
                # Column 1 is taken as the positive class, as before --
                # NOTE(review): assumes two classes ordered [0, 1]; confirm.
                scores[has_features.values] = probabilities[:, 1]
        outputdf = pd.DataFrame({"output": scores})
        scored_groups.append(pd.concat([per_dist, outputdf], axis=1))
    if not scored_groups:
        return pd.DataFrame()
    # One concat at the end instead of repeatedly appending inside the loop.
    return pd.concat(scored_groups, ignore_index=True)

# Score the validation pairs with the per-expert models (grouped by U_id,
# joined to the question features on Q_id).
final_predicted_data = SVCPredict(
    "U_id", "Q_id", question_data, validation_data, e_val_dict, e_model_dict)
# Alternative: score with the per-question models instead.
#final_predicted_data = SVCPredict("Q_id","U_id",expert_data,validation_data,q_val_dict,q_model_dict)
# Write the scored rows without index or header row.
final_predicted_data.to_csv('val.csv', index=False, header=False, sep=',')