In [0]:
import pandas as pd
import numpy as np
import time
import math
import matplotlib.pyplot as plt

def set_med_mad(quant_q):
    """Attach timing statistics derived from each question's estimated time.

    Two columns are added to ``quant_q`` in place, and the same frame is
    returned: ``median_time`` is a copy of the ``time`` column and
    ``mad_time`` is 0.25 times that value.
    """
    estimated = quant_q.time.values
    quarter_of_estimated = estimated * 0.25

    new_columns = (('median_time', estimated), ('mad_time', quarter_of_estimated))
    for column, values in new_columns:
        quant_q[column] = pd.Series(values.tolist(), index = quant_q.index)
    return quant_q

def dotproduct(a,b,c):
    """Return the sum over ``k`` of ``a[k] * b[k] * c[k]``.

    The length of ``a`` decides which positions are visited; ``b`` and ``c``
    are indexed at those same positions, so a shorter ``b`` or ``c`` raises
    ``IndexError`` and extra trailing elements are ignored.
    """
    total = 0
    position = 0
    count = len(a)
    while position < count:
        total = total + a[position] * b[position] * c[position]
        position = position + 1
    return total
    
def check_if_correct(inter,quant_q):
    """Grade one interaction row against the question table.

    Parameters
    ----------
    inter : sequence
        One row of the interaction table, read positionally: ``inter[4]`` is
        matched against ``quant_q.entrayn_glu_id``, ``inter[5]`` must equal
        ``"entrayn_glu"``, ``inter[6]`` is passed through as the timestamp and
        ``inter[7]`` is compared with the question's ``answer``.
    quant_q : pandas.DataFrame
        Question table with the columns ``entrayn_glu_id``, ``difficulty``,
        ``L1``, ``L2``, ``time``, ``median_time``, ``mad_time`` and ``answer``.

    Returns
    -------
    tuple
        ``(success_status, difficulty, l1_tag, l2_tag, est_time, median_time,
        mad_time, timestamp)``.  ``success_status`` is 1 when any matching
        question row has an answer equal to ``inter[7]``, else 0; the other
        question fields come from the last matching row.  A tuple of eight
        ``False`` values is returned when the row is not an ``entrayn_glu``
        interaction or no question matches.
    """
    not_found = (False,) * 8

    # Rows of any other entity type used to fall through to the final return
    # with every local unbound; treat them the same as "question not found".
    if inter[5] != "entrayn_glu":
        return not_found

    # Positional (not label-based) lookup of every row for this question id.
    positions = np.flatnonzero((quant_q.entrayn_glu_id == inter[4]).values)
    if len(positions) == 0:
        return not_found

    answers = quant_q.answer.values[positions]
    success_status = 1 if any(answer == inter[7] for answer in answers) else 0

    # With duplicated ids the last matching row supplies the metadata.
    last = positions[-1]
    return (success_status,
            quant_q.difficulty.iloc[last],
            quant_q.L1.iloc[last],
            quant_q.L2.iloc[last],
            quant_q['time'].iloc[last],
            quant_q.median_time.iloc[last],
            quant_q.mad_time.iloc[last],
            inter[6])

def attempt_hist_user(int_data,quant_q,reattempt_data,uid):
    """Build the graded attempt history of one user.

    Every row of ``int_data`` that belongs to ``uid``, has type
    ``"navigation_next"`` and is matched to a question by
    ``check_if_correct`` produces one output row.

    Parameters
    ----------
    int_data : pandas.DataFrame
        Interaction table; the columns ``uid``, ``type``, ``entity_id``,
        ``interaction_id`` and ``value2`` are read, and each full row is
        handed to ``check_if_correct``.  Rows are addressed by the labels
        ``0 .. len(int_data) - 1``.
    quant_q : pandas.DataFrame
        Question table (see ``check_if_correct``).
    reattempt_data : pandas.DataFrame
        Table with the columns ``uid``, ``q_id`` and ``no_of_attempts``.
    uid :
        User id to build the history for.

    Returns
    -------
    pandas.DataFrame
        Columns ``grade``, ``int_id``, ``gluid``, ``time_taken``,
        ``difficulty`` (shifted by +1), ``test_time``, ``l1_tag``, ``l2_tag``,
        ``median_time``, ``mad_time``, ``time_factor``, ``attempt_factor``,
        ``timestamp`` and ``weighted_score`` (``grade * difficulty``).
    """
    column_names = ['grade', 'int_id', 'gluid', 'time_taken', 'difficulty',
                    'test_time', 'l1_tag', 'l2_tag', 'median_time', 'mad_time',
                    'time_factor', 'attempt_factor', 'timestamp', 'weighted_score']
    collected = {name: [] for name in column_names}

    set_of_quant_q = set(quant_q.entrayn_glu_id.tolist())
    # The user's reattempt rows do not depend on the interaction row, so they
    # are selected once instead of once per interaction.
    user_reattempts = reattempt_data.loc[reattempt_data.uid == uid]

    for i in range(len(int_data)):
        if int_data.uid[i] != uid or int_data.type[i] != "navigation_next":
            continue

        success, diff, l1, l2, est, median, mad, stamp = check_if_correct(int_data.loc[i].values,quant_q)
        if success is False:
            # check_if_correct signals "no matching question" with False.
            continue

        entity = int_data.entity_id[i]
        spent = int_data.value2[i]

        # Attempt factor.  It is reset for every recorded row: previously a
        # question without a reattempt record silently reused the factor of
        # an earlier row (or failed when there was none).  1 means no penalty.
        N = 1
        if entity in set_of_quant_q:
            attempts = user_reattempts.loc[user_reattempts.q_id == entity].no_of_attempts.tolist()
            if attempts:
                first = attempts[0]
                if first == 1:
                    N = 1
                elif first == 2:
                    N = 0.75
                elif first == 3:
                    N = 0.5
                elif first == 4:
                    N = 0.25
                elif first >= 5:
                    N = 0.1

        # Time factor, stepping down as the time taken exceeds the median by
        # 1, 2 and 4 MADs.  Also reset per row; 1 is kept only when no band
        # matches (for example when the thresholds are NaN).
        t = 1
        if spent <= (median + mad):
            t = 1
        if spent > (median + mad) and spent <= (median + 2*mad):
            t = 0.9
        if spent > (median + 2*mad) and spent <= (median + 4*mad):
            t = 0.8
        if spent > (median + 4*mad):
            t = 0.7
        if math.isnan(spent) == True:
            t = 0

        collected['grade'].append(success)
        collected['int_id'].append(int_data.interaction_id[i])
        collected['gluid'].append(entity)
        collected['time_taken'].append(spent)
        # Difficulty is shifted from a 0 - 4 scale to a 1 - 5 scale.
        collected['difficulty'].append(diff + 1)
        collected['test_time'].append(est)
        collected['l1_tag'].append(l1)
        collected['l2_tag'].append(l2)
        collected['median_time'].append(median)
        collected['mad_time'].append(mad)
        collected['time_factor'].append(t)
        collected['attempt_factor'].append(N)
        collected['timestamp'].append(stamp)
        collected['weighted_score'].append(success*(diff + 1))

    hist_user = pd.DataFrame({name: pd.Series(collected[name]) for name in column_names})
    return hist_user

def calc_quant_ps(hist,n_of_q):
    """Append running predicted-score columns to a user's attempt history.

    For every prefix of ``hist`` (first row, first two rows, ...) eight scores
    of the form ``int(130 + total_weighted_score / total_difficulty * 40)``
    are computed:

    * "ideal" scores use, for each of the L2 tags Algebra, Arithmetic,
      Data Analysis and Geometry, the last ``n_of_q // 4`` rows seen so far;
    * "actual" scores use the last ``n_of_q`` rows seen so far;
    * the ``_time`` variants weight each score by ``time_factor``, the
      ``_reattempt`` variants by ``attempt_factor``, and the combined
      variants by both.

    Short windows are padded with entries of score 0, difficulty 3 and
    factors 1.

    Parameters
    ----------
    hist : pandas.DataFrame
        Output of ``attempt_hist_user``; the columns ``l2_tag``,
        ``weighted_score``, ``difficulty``, ``time_factor`` and
        ``attempt_factor`` are read.  Modified in place.
    n_of_q : int
        Window size; used in the names of the added columns.

    Returns
    -------
    pandas.DataFrame
        ``hist`` with eight ``ps_*_<n_of_q>*`` columns added.
    """
    topics = ('Algebra', 'Arithmetic', 'Data Analysis', 'Geometry')
    # Column -> value used to pad a window that is still too short.
    pad_value = {'weighted_score': 0, 'difficulty': 3, 'time_factor': 1, 'attempt_factor': 1}
    # NOTE(review): padding targets are fixed at 5 per topic and 20 overall,
    # which equals n_of_q // 4 and n_of_q only for n_of_q == 20.  Kept as is
    # to preserve existing results; confirm whether they should follow n_of_q.
    topic_minimum = 5
    overall_minimum = 20
    # Integer division: DataFrame.tail needs an int, and "/" yields a float
    # on Python 3.
    threshold = n_of_q // 4

    def padded(frame, column, minimum):
        values = frame[column].values.tolist()
        values.extend([pad_value[column]] * (minimum - len(values)))
        return values

    def to_score(total_ws, total_weight):
        return int(130 + ((total_ws * 1. / total_weight)*40))

    pred_score_ideal = []
    pred_score_ideal_time = []
    pred_score_actual = []
    pred_score_actual_time = []
    pred_score_ideal_attempt = []
    pred_score_ideal_time_attempt = []
    pred_score_actual_attempt = []
    pred_score_actual_attempt_time = []

    for i in range(1, len(hist) + 1):
        seen = hist.head(n = i)

        # Per-topic windows, concatenated in the fixed topic order.  The mask
        # is built from the slice itself so its index matches the slice.
        ideal = {column: [] for column in pad_value}
        for topic in topics:
            recent = seen[seen.l2_tag == topic].tail(n = threshold)
            for column in pad_value:
                ideal[column].extend(padded(recent, column, topic_minimum))

        window = seen.tail(n = n_of_q)
        actual = {column: padded(window, column, overall_minimum) for column in pad_value}

        ideal_ws = np.array(ideal['weighted_score'])
        ideal_tf = np.array(ideal['time_factor'])
        ideal_af = np.array(ideal['attempt_factor'])
        actual_ws = np.array(actual['weighted_score'])
        actual_tf = np.array(actual['time_factor'])
        actual_af = np.array(actual['attempt_factor'])

        total_weight_ideal = np.sum(ideal['difficulty'])
        total_weight_actual = np.sum(actual['difficulty'])

        pred_score_ideal_time.append(to_score(np.dot(ideal_ws, ideal_tf), total_weight_ideal))
        pred_score_ideal.append(to_score(np.sum(ideal_ws), total_weight_ideal))
        pred_score_actual_time.append(to_score(np.dot(actual_ws, actual_tf), total_weight_actual))
        pred_score_actual.append(to_score(np.sum(actual_ws), total_weight_actual))
        pred_score_ideal_time_attempt.append(to_score(dotproduct(ideal_ws, ideal_tf, ideal_af), total_weight_ideal))
        pred_score_ideal_attempt.append(to_score(np.dot(ideal_ws, ideal_af), total_weight_ideal))
        pred_score_actual_attempt_time.append(to_score(dotproduct(actual_ws, actual_tf, actual_af), total_weight_actual))
        pred_score_actual_attempt.append(to_score(np.dot(actual_ws, actual_af), total_weight_actual))

    suffix = str(n_of_q)
    hist['ps_ideal_' + suffix + '_time'] = pd.Series(pred_score_ideal_time,index = hist.index)
    hist['ps_ideal_' + suffix] = pd.Series(pred_score_ideal,index = hist.index)
    hist['ps_actual_' + suffix + '_time'] = pd.Series(pred_score_actual_time,index = hist.index)
    hist['ps_actual_' + suffix] = pd.Series(pred_score_actual,index = hist.index)
    hist['ps_ideal_' + suffix + '_time' + '_reattempt'] = pd.Series(pred_score_ideal_time_attempt,index = hist.index)
    hist['ps_ideal_' + suffix + '_reattempt'] = pd.Series(pred_score_ideal_attempt,index = hist.index)
    hist['ps_actual_' + suffix + '_reattempt'] = pd.Series(pred_score_actual_attempt,index = hist.index)
    hist['ps_actual_' + suffix + '_time' + '_reattempt'] = pd.Series(pred_score_actual_attempt_time,index = hist.index)

    return hist

def const_user_table(uid, int_data=None, quant_q=None, reattempt_data=None):
    """Build the attempt history of ``uid`` with 20- and 40-question scores.

    Parameters
    ----------
    uid :
        User id to process.
    int_data, quant_q, reattempt_data : pandas.DataFrame, optional
        Already-loaded input tables.  Any table left as ``None`` is read from
        its CSV file, which is what happens when only ``uid`` is passed.
        Passing the frames lets a caller that loops over many users avoid
        re-reading the same three files for every user.

    Returns
    -------
    pandas.DataFrame
        Result of ``attempt_hist_user`` extended by ``calc_quant_ps`` for
        window sizes 20 and 40.
    """
    if int_data is None:
        int_data = pd.read_csv('~/Downloads/int_data.csv')
    if quant_q is None:
        quant_q = pd.read_csv('~/Downloads/quant_question_data.csv')
    else:
        # set_med_mad adds columns in place; keep the caller's frame untouched.
        quant_q = quant_q.copy()
    if reattempt_data is None:
        reattempt_data = pd.read_csv('~/Desktop/reattempt_factor_final.csv')

    quant_q = set_med_mad(quant_q)
    hist_user_data = attempt_hist_user(int_data,quant_q,reattempt_data,uid)
    for window_size in (20, 40):
        hist_user_data = calc_quant_ps(hist_user_data,window_size)
    return hist_user_data

In [0]:
# Load the three input tables.
int_data = pd.read_csv('~/Downloads/int_data.csv')
quant_q = pd.read_csv('~/Downloads/quant_question_data.csv')
reattempt_data = pd.read_csv('~/Desktop/reattempt_factor_final.csv')

# Distinct user ids and distinct question ids, each kept as a set and a list.
set_of_users = set(int_data['uid'])
list_of_users = list(set_of_users)

set_of_questions = set(quant_q['entrayn_glu_id'])
list_of_questions = list(set_of_questions)

def get_standard_error_user_degree(user_table,column_name,degree):
    """Fit a polynomial trend to one column and return its mean squared error.

    The table's index values are used as x and ``user_table[column_name]`` as
    y.  Despite the function's name, the value returned is the mean of the
    squared residuals, not a standard error.

    Side effects: ``user_table`` gains (or has overwritten) the columns
    ``trend_data`` (fitted values) and
    ``squared_diff_<column_name>_<degree>`` (squared residuals).

    Parameters
    ----------
    user_table : pandas.DataFrame
        Table with a numeric index and the column ``column_name``.
    column_name : str
        Column to fit.
    degree : int
        Degree of the fitted polynomial.

    Returns
    -------
    float
        Mean of the squared differences between trend and data.
    """
    x = np.asarray(user_table.index.tolist())
    y = np.asarray(user_table[column_name].tolist())

    coeff = np.polyfit(x,y,degree)
    # Evaluate positionally so any numeric index works, not only the labels
    # 0 .. n-1.
    trend_data = np.polyval(coeff, x)
    squared_diff = (trend_data - y)**2

    squared_diff_column = 'squared_diff_'+column_name+'_'+str(degree)
    user_table['trend_data'] = pd.Series(trend_data, index = user_table.index)
    user_table[squared_diff_column] = pd.Series(squared_diff, index = user_table.index)
    standard_error = np.mean(user_table[squared_diff_column])
    return standard_error

In [0]:
set_of_users = set(int_data.uid.tolist())
list_of_users = list(set_of_users)
set_of_quant_q = set(quant_q.entrayn_glu_id.tolist())

start = time.time()
# Count, per user, the interaction rows whose entity is a quant question.
# This is done in one vectorised pass: the earlier row-by-row update went
# through chained assignment (df.col[mask] = ...), which pandas does not
# guarantee to write back into df.
is_quant_row = int_data.entity_id.isin(set_of_quant_q)
quant_rows_per_user = int_data.loc[is_quant_row, 'uid'].value_counts()

df = pd.DataFrame({'uid':pd.Series(list_of_users)})
# Users without any quant row are absent from the counts and get 0.
df['quant_q_attempted'] = df.uid.map(quant_rows_per_user).fillna(0).astype(int)
#print time.time() - start

In [0]:
# Keep only the users that attempted at least one quant question.
set_of_users_no_quant = set(df.loc[df.quant_q_attempted == 0].uid.values.tolist())
users_to_run_through = set_of_users - set_of_users_no_quant
users_to_run_through = list(users_to_run_through)

list_of_valid_users = pd.DataFrame({'uid':pd.Series(users_to_run_through)})

# One zero-filled float column per (error kind, polynomial degree 1..5).
std_error_prefixes = ('std_error_id_40_',
                      'std_error_id_40_time_',
                      'std_error_ac_40_',
                      'std_error_ac_40_time_',
                      'std_error_id_40_time_attempt_',
                      'std_error_id_40_attempt_',
                      'std_error_ac_40_attempt_',
                      'std_error_ac_40_attempt_time_')
for i in range(1,6):
    for prefix in std_error_prefixes:
        list_of_valid_users[prefix + str(i)] = 0.
        
        
                            

In [0]:
start = time.time()

# (result column prefix, predicted-score column) pairs, evaluated in this
# order for every polynomial degree.
std_error_sources = (('std_error_id_40_', 'ps_ideal_40'),
                     ('std_error_id_40_time_', 'ps_ideal_40_time'),
                     ('std_error_ac_40_', 'ps_actual_40'),
                     ('std_error_ac_40_time_', 'ps_actual_40_time'),
                     ('std_error_id_40_time_attempt_', 'ps_ideal_40_time_reattempt'),
                     ('std_error_id_40_attempt_', 'ps_ideal_40_reattempt'),
                     ('std_error_ac_40_attempt_', 'ps_actual_40_reattempt'),
                     ('std_error_ac_40_attempt_time_', 'ps_actual_40_time_reattempt'))

n = len(list_of_valid_users)
for j in range(n):
    user_table = const_user_table(list_of_valid_users.uid[j])
    for i in range(1,6):
        for prefix, ps_column in std_error_sources:
            # A single .loc write: frame[col][j] = value is chained
            # assignment and is not guaranteed to update the frame.
            list_of_valid_users.loc[j, prefix + str(i)] = get_standard_error_user_degree(user_table,ps_column,i)
#print time.time() - start

In [0]:
# Summary statistics (as produced by DataFrame.describe) of every column of
# list_of_valid_users, written to a CSV file in ~/Downloads.
a = list_of_valid_users.describe()
#list_of_valid_users.to_csv('~/Downloads/list_of_valid_users.csv')
a.to_csv('~/Downloads/describe_stats_with_reattempt_factor1.csv')

In [0]:
# Scratch check of the reattempt lookup: reload the three tables, take the
# user of the interaction row labelled 1000 and list that user's recorded
# attempt counts for question id 15505.
# Note: this rebinds the notebook-level names `i` and `a`.
int_data = pd.read_csv('~/Downloads/int_data.csv')
quant_q = pd.read_csv('~/Downloads/quant_question_data.csv')
reattempt_data=pd.read_csv('~/Desktop/reattempt_factor_final.csv')
i =1000
a = reattempt_data.loc[reattempt_data.uid == int_data.uid[i]]
(a.loc[a.q_id==15505].no_of_attempts.values.tolist())

1560    2
Name: no_of_attempts, dtype: int64

In [0]:
# Inspect the `uid` value stored under index label 5 of int_data.
int_data.uid[5]

In [0]:
# Inspect the `entity_id` stored under index label `i`; `i` is whatever value
# an earlier cell left behind.
int_data.entity_id[i]

In [0]:
# Reload the three input tables and rebuild the set of quant question ids.
int_data = pd.read_csv('~/Downloads/int_data.csv')
quant_q = pd.read_csv('~/Downloads/quant_question_data.csv')
reattempt_data=pd.read_csv('~/Desktop/reattempt_factor_final.csv')
set_of_quant_q = set(quant_q.entrayn_glu_id.tolist())
# NOTE: `return` is only valid inside a function; a bare `return N` at cell
# level is a SyntaxError that prevents the whole cell from running, so this
# cell deliberately contains no such statement.
         


In [0]:
# Attempt counts recorded in `a` for the question of interaction row `i`.
# Relies on `a` and `i` as left behind by earlier cells.
attempts=(a.loc[a.q_id==int_data.entity_id[i]].no_of_attempts)
print (attempts)

In [0]:
import numpy as np

# Cross product of two identical 3-vectors.
a = np.array((1, 4, 5))
b = np.array((1, 4, 5))
np.cross(a, b)

In [0]:
def matrix_multiplication(a,b,c):
    """Return the sum of the element-wise triple products ``a[k]*b[k]*c[k]``.

    Despite the name this is not a matrix product: it multiplies the three
    sequences position by position and adds the results.

    The running total is local to each call.  It was previously a
    notebook-level global named ``sum``, so every call added to the result of
    the previous one and the builtin ``sum`` was hidden for the rest of the
    notebook; the loop index was a global as well.

    Parameters
    ----------
    a, b, c : sequence of numbers
        Indexable sequences; every position of ``a`` is visited, so ``b`` and
        ``c`` must be at least as long as ``a``.

    Returns
    -------
    number
        ``a[0]*b[0]*c[0] + a[1]*b[1]*c[1] + ...`` (0 for an empty ``a``).
    """
    total = 0
    for k in range(len(a)):
        total = total + a[k]*b[k]*c[k]
    return total

In [0]:
# Try the helper on three sample 3-element vectors.
matrix_multiplication(np.array([7,2,3]),np.array([1,2,3]),np.array([1,2,3]))

In [0]:
import numpy as np
# reduce lives in functools; there is no top-level module named "reduce".
from functools import reduce
V1=(1,2,3)
V2=(1,2,3)
V3=(1,2,3)
# zip over three sequences yields 3-tuples, so each term needs all three
# factors.  reduce is used for the running total so the cell does not depend
# on the name `sum` still referring to the builtin.
triple_sum = reduce(lambda acc, terms: acc + terms[0]*terms[1]*terms[2], zip(V1,V2,V3), 0)
triple_sum

In [0]:
import numpy as np
# np.array takes ONE array-like as its first argument; a second positional
# argument is interpreted as the dtype, so passing two lists raises TypeError.
# The two rows are therefore wrapped in an outer list.
np.array([[1+2],[3+4]])

In [0]:
# Three rows of two values mixing ints, floats and one complex number; NumPy
# stores them in a single array with one common dtype.
np.array([[1,2.0],[0,0],(1+1j,3.)])

In [0]:
# NOTE(review): alg_ws and geo_ws are only assigned inside calc_quant_ps in
# the visible cells, not at notebook level, so this presumably raises
# NameError on a fresh kernel -- confirm where these names should come from.
len(alg_ws)==len(geo_ws)

In [0]:
def dotproduct(a,b,c):
    """Sum the position-wise products ``a[k] * b[k] * c[k]``.

    Every index of ``a`` is visited; ``b`` and ``c`` are read at the same
    indices, so either of them being shorter than ``a`` raises
    ``IndexError``.
    """
    running_total = 0
    for position in range(len(a)):
        term = a[position] * b[position] * c[position]
        running_total = running_total + term
    return running_total

In [0]:
# The second argument has only two elements while the first has three, so the
# lookup b[2] inside dotproduct raises IndexError.
dotproduct([1,2,3],[1,3],[1,2,3])