In [None]:
import numpy as np
import pandas as pd
import scipy.sparse
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

def readData(filename,columns):
    """Load a headerless, tab-separated file and label its columns.

    Args:
        filename: Path (or file-like object) handed to ``pd.read_csv``.
        columns: Column labels assigned to the loaded frame, in order.

    Returns:
        The loaded ``DataFrame`` with ``columns`` as its column labels.

    Raises:
        ValueError: If ``columns`` does not have one label per parsed
            column (raised by pandas on assignment).
    """
    frame = pd.read_csv(
        filename,
        sep="\t",
        header=None,  # the first line is data, not column names
    )
    frame.columns = columns
    return frame

def expandCol(data,column_name):
    """Turn a slash-delimited text column into a token-count matrix.

    Every value of ``data[column_name]`` is cast to ``str`` first (so a
    missing value is tokenized as its string form), split on ``"/"``,
    and counted with a freshly fitted ``CountVectorizer``. All other
    vectorizer settings are left at the library defaults.

    Args:
        data: DataFrame holding the column to expand.
        column_name: Label of the column to tokenize.

    Returns:
        The matrix produced by ``CountVectorizer.fit_transform``: one row
        per row of ``data``, one column per distinct token. The fitted
        vectorizer (and its vocabulary) is not returned.
    """
    def split_on_slash(text):
        return text.split("/")

    column_as_text = data[column_name].astype(str)
    vectorizer = CountVectorizer(tokenizer=split_on_slash)
    return vectorizer.fit_transform(column_as_text)

def appendExpandedCol(data , vector_to_expand):
    """Append densified count matrices to ``data`` as new columns.

    For each ``key -> matrix`` entry, the matrix is converted to a dense
    array and added as columns named ``key0``, ``key1``, ... (one per
    matrix column), in the iteration order of ``vector_to_expand``.

    The new columns are attached row-by-row (positionally): each expanded
    frame reuses ``data.index`` so the concatenation cannot misalign or
    introduce NaN padding when ``data`` does not carry a default
    ``RangeIndex``.

    Args:
        data: Base DataFrame; it is not modified.
        vector_to_expand: Mapping of column-name prefix to a matrix that
            exposes ``.shape`` and ``.toarray()`` and has one row per row
            of ``data``.

    Returns:
        A new DataFrame with the expanded columns appended, or ``data``
        itself when ``vector_to_expand`` is empty.

    Raises:
        ValueError: If a matrix does not have ``len(data)`` rows.
    """
    expanded_frames = [
        pd.DataFrame(
            matrix.toarray(),
            columns=[key + str(num) for num in range(matrix.shape[1])],
            index=data.index,
        )
        for key, matrix in vector_to_expand.items()
    ]
    if not expanded_frames:
        return data
    # One concat for everything instead of re-copying the growing frame
    # once per expanded column group.
    return pd.concat([data] + expanded_frames, axis=1)

def getQuestData():
    """Build the question feature table from ``question_info.txt``.

    The file is read with ``readData``. ``Word_seq`` is discarded,
    ``Q_tag`` and ``Q_Char_seq`` are replaced by the count columns that
    ``expandCol`` produces for them, and the remaining columns
    (``Q_id``, ``Likes``, ``Ans``, ``Top_Ans``) are kept unchanged.

    Returns:
        DataFrame with the kept columns followed by the ``Q_tag*`` and
        ``Q_Char_seq*`` count columns added by ``appendExpandedCol``.
    """
    header = ["Q_id","Q_tag","Word_seq","Q_Char_seq","Likes","Ans","Top_Ans"]
    questions = readData('question_info.txt', header)

    expanded = {}
    for name in ("Q_tag", "Q_Char_seq"):
        expanded[name] = expandCol(questions, name)

    # The raw text columns are no longer needed once they are vectorized.
    questions = questions.drop(["Word_seq", "Q_tag", "Q_Char_seq"], axis=1)
    return appendExpandedCol(questions, expanded)

def getExpData():
    """Build the expert (user) feature table from ``user_info.txt``.

    The file is read with ``readData``. ``U_Word_seq`` is discarded,
    ``U_tag`` and ``U_Char_seq`` are replaced by the count columns that
    ``expandCol`` produces for them, and ``U_id`` is kept unchanged.

    Returns:
        DataFrame with ``U_id`` followed by the ``U_tag*`` and
        ``U_Char_seq*`` count columns added by ``appendExpandedCol``.
    """
    header = ["U_id","U_tag","U_Word_seq","U_Char_seq"]
    experts = readData('user_info.txt', header)

    expanded = {}
    for name in ("U_tag", "U_Char_seq"):
        expanded[name] = expandCol(experts, name)

    # The raw text columns are no longer needed once they are vectorized.
    experts = experts.drop(["U_Word_seq", "U_tag", "U_Char_seq"], axis=1)
    return appendExpandedCol(experts, expanded)

# ---------------------------------------------------------------------------
# Training: join every labelled (question, expert) pair with both feature
# tables and fit a small random forest on the result.
# ---------------------------------------------------------------------------
question_data = getQuestData()
expert_data = getExpData()
expert_question_dist = pd.read_csv('invited_info_train.txt', sep="\t",header=None)
expert_question_dist.columns = ["Q_id","U_id","output"]
final_data = pd.merge(question_data, expert_question_dist, on="Q_id",how = 'inner')
final_data = pd.merge(final_data, expert_data, on='U_id', how='inner')
final_target = final_data["output"]
# Q_id / U_id are string identifiers: they cannot be converted to a numeric
# matrix and are removed before prediction as well, so they must be dropped
# here together with the label to keep the feature sets identical.
final_data = final_data.drop(["Q_id", "U_id", "output"], axis=1)
# Remember the exact training column order so prediction uses the same one.
feature_columns = list(final_data.columns)
model = RandomForestClassifier(n_estimators=3)
model.fit(scipy.sparse.csr_matrix(final_data.values), final_target.values)

# ---------------------------------------------------------------------------
# Prediction: score the pairs listed in the validation file.
# ---------------------------------------------------------------------------
# Only the first two columns are used; select them by position, because the
# file's own header row supplies string labels and `[[0, 1]]` would be a
# label lookup.
validation_data = pd.read_csv('validate_nolabel.txt', sep=",").iloc[:, [0, 1]]
validation_data.columns = ["Q_id","U_id"]
final_data = pd.merge(validation_data, question_data, on="Q_id",how = 'inner')
final_data = pd.merge(final_data, expert_data, on='U_id', how='inner')
# Take the identifiers from the merged frame itself: an inner join may drop
# or reorder rows relative to validation_data, and each prediction has to
# stay next to the pair it was computed for.
# NOTE(review): pairs whose Q_id or U_id is missing from the feature tables
# are still omitted from the output, as before — confirm that is acceptable.
validation_ids = final_data[["Q_id", "U_id"]].reset_index(drop=True)
final_data = final_data[feature_columns]
predicted = model.predict_proba(scipy.sparse.csr_matrix(final_data.values))
# Column 1 is the probability of the second entry of model.classes_
# (presumably label 1 — confirm against the training labels).
outputdf = pd.DataFrame({"output": predicted[:, 1]})
final_predicted_data = pd.concat([validation_ids,outputdf],axis=1)
final_predicted_data.to_csv('val.csv',sep=',', index=False, header = False)

<bound method DataFrame.count of                                    Q_id                              U_id
0      726c3abc84a99da48e27c2d8f19c2bc8  e85d05b6796c351e6a83cfc85309b023
1      3737dc50f0f4b1a3df77b96cbf9ce928  e13419f565027f9c64bdab889acd1cbe
2      a2adc7a16a129c2326706238e186cd11  b19ebc32f3d1de37e12792edc149993f
3      56fadfe2695e9ce7c1e50d5f0fa705fb  ac0efd36e6f6eca572f448d6020dce89
4      cf06c69d63dcab40a6f0b35d594d42b7  6aa377580a87ab45bc40da0cf220e530
5      23f707b3c605454f7a095922777c2a58  74c3f0c758dd8c3ae3a9f486346f0c56
6      40ed7a89f51921e410d278658b6563ac  f33d4e894c2229000114eab3a3fd8667
7      239ba3e9cfd9b2582e033b0446f7b6ce  eb597fb9e6c92106ad976a2bfc33f11a
8      9c6ed35f3cfd9ce2bf8cc0277677772a  6f2645a07e0cb5a71be55d8d0aca535f
9      406eb8a04629fe6e453e793b5368faa8  ad5b4b01889ed750bc1de041409a889d
10     0394717346380f37af9f332a6e0b711d  58f771da2d34ca41fcb73abef11d8f1e
11     a35c0190ef079f7a60c8dfc285cf26f1  2c5cfd88c429376ba81e24d63b994a27
12   