In [1]:
import os
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sqlalchemy
from sklearn.preprocessing import StandardScaler,MinMaxScaler

from accuracy_calc import *

In [2]:
# Load the per-question feature table from features.csv (path is relative to
# the notebook's working directory).
df = pd.read_csv("features.csv")

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,question_id,correctly_answered,incorrectly_answered,not_answered,avg_marks_correct,avg_marks_incorrect,avg_marks_na,f1,f2,perc_corr,perc_na,perc_incorr
0,0,1,144,91,34,30.4167,17.7253,24.6855,0.007733,0.278695,25,6,1
1,1,2,114,88,53,29.8246,15.0341,19.9795,0.006053,0.2082,13,0,2
2,2,3,67,132,48,31.1045,21.3409,26.5354,0.003645,0.13186,11,11,12
3,3,4,146,65,54,34.8288,15.3692,22.0449,0.0085,0.340585,41,6,2
4,4,5,82,97,63,31.6951,21.3093,27.0053,0.004511,0.178583,21,18,8


In [4]:
# Drop the leftover "Unnamed: 0" column. Reassigning (instead of inplace=True)
# and ignoring an already-missing column keeps this cell safe to re-run; the
# in-place version raised KeyError on the second execution.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [5]:
# Column names of the frame, in order, as a plain Python list.
col_vals = df.columns.tolist()

In [6]:
# Select the model features by name rather than by position. A positional
# slice silently changes meaning whenever a leading index-like column is added
# or dropped (it can pull in question_id and lose perc_incorr).
feature_cols = [
    "correctly_answered", "incorrectly_answered", "not_answered",
    "avg_marks_correct", "avg_marks_incorrect", "avg_marks_na",
    "f1", "f2", "perc_corr", "perc_na", "perc_incorr",
]

# Rescale every feature to the [0, 1] range so that no single column dominates
# the Euclidean distances used later.
ss = MinMaxScaler()
data = ss.fit_transform(np.array(df[feature_cols]))
data[0]

array([0.39940828, 0.2724359 , 0.29824561, 0.56506579, 0.51733735,
       0.59946264, 0.42556471, 0.45540564, 0.43859649, 0.15384615,
       0.02702703])

In [81]:
# Number of clusters to learn and the shape of the scaled feature matrix
# (rows = data points, columns = features).
n_clusters = 3
total_data, features = data.shape
# Container for the weight vectors; filled by the initialisation cell.
W = list()

In [16]:
def get_weights(total_data, rng=None):
    '''
    Draw one random initial weight for the neural network.

    The value is sampled uniformly from an interval centred on 0.5 whose
    half-width is 1/sqrt(total_data), i.e.
    [0.5 - 1/sqrt(total_data), 0.5 + 1/sqrt(total_data)).

    total_data: number of data points (not the number of features); must be
                positive.
    rng: optional random source exposing a ``random()`` method (for example
         ``np.random.default_rng(seed)``). When omitted, NumPy's global
         random state is used, exactly as before.
    returns: a single float weight.
    raises: ValueError if total_data is not positive.
    '''
    # A non-positive size would divide by zero (or take the root of a negative
    # number) and silently yield inf/nan weights.
    if total_data <= 0:
        raise ValueError("total_data must be a positive number")
    draw = np.random.random() if rng is None else rng.random()
    y = draw*(2.0/np.sqrt(total_data))
    return 0.5 - (1/np.sqrt(total_data)) + y

In [83]:
# Initialise one weight vector per cluster with get_weights.
# W is rebuilt from scratch (rather than appended to) so that re-running this
# cell does not keep adding rows, and get_weights is given the total_data
# argument it requires (calling it with no argument raises TypeError).
W = [
    [get_weights(total_data) * 0.5 for _ in range(features)]
    for _ in range(n_clusters)
]

In [84]:
# Display the initial weight vectors (one list per cluster).
W

[[0.2555237560289629,
  0.25223635191258925,
  0.24102473695737708,
  0.25515116707828167,
  0.24229903814470533,
  0.25200621034550197,
  0.23978730277346097,
  0.25867184906231566,
  0.2409201305398317,
  0.2519038336141455,
  0.2386874120347491],
 [0.2508946456869702,
  0.2458997733876654,
  0.251159332793222,
  0.2582695663211468,
  0.2508801293476367,
  0.2563892370781587,
  0.25129078452283177,
  0.2463051314720393,
  0.24775153231964853,
  0.24988826082485552,
  0.23875657652444127],
 [0.25147468570776016,
  0.2400584962302346,
  0.2510743156167749,
  0.25165339152813393,
  0.2601248278056805,
  0.2495500415433837,
  0.2571481360053348,
  0.24766253608215752,
  0.2508060114433547,
  0.2455484560101217,
  0.25132065140483345]]

In [8]:
def compute_distance(w,x):
    '''
    Euclidean distance between a weight vector and a data vector.

    w: weights
    x: features (only the first len(w) entries are read)
    returns: square root of the summed squared element-wise differences
    '''
    # Element-wise gaps, walked in index order over the length of w.
    gaps = (w[idx] - x[idx] for idx in range(len(w)))
    squared_total = sum(gap*gap for gap in gaps)
    return np.sqrt(squared_total)

In [9]:
def find_closest_to_x(W,x):
    '''
    Find the weight vector in W that lies nearest to the data vector x.

    W: sequence of weight vectors (must contain at least one)
    x: features
    returns: tuple (closest weight vector, its index in W); on ties the
             earliest vector wins because the comparison is strict
    '''
    # Start from the first vector as the provisional winner.
    best_index = 0
    best_vector = W[0]
    best_dist = compute_distance(best_vector, x)
    for index, candidate in enumerate(W):
        candidate_dist = compute_distance(candidate, x)
        if candidate_dist < best_dist:
            best_index, best_vector, best_dist = index, candidate, candidate_dist
    return (best_vector, best_index)

**After receiving an input vector $x$, the winning neuron (the one whose weight vector $w$ is closest to $x$) updates its weights according to the formula $w_n \leftarrow w_n + \lambda\,(x_n - w_n)$, where $\lambda$ is the learning-rate coefficient. We repeat this for every $x$ in the training set, then reduce $\lambda$ by $\Delta\lambda$ and repeat for as long as $\lambda > 0$. Input vectors can be picked randomly or in a specific order. In this loop, $\lambda$ and $\Delta\lambda$ are our parameters, which we define and can modify.**



In [20]:
def fit_predict(data, n_clusters=3, la=0.3, dla=0.05, epochs=10, seed=None):
    '''
    Train a winner-takes-all competitive network on data and return, for every
    row, the index of the closest learned weight vector.

    For each learning rate (starting at la and decreasing by dla while it is
    still positive) the data is swept `epochs` times; for every row the
    closest weight vector w is moved towards the row x with
    w = w + la * (x - w).

    data: 2-D sequence of feature rows (e.g. the scaled numpy array)
    n_clusters: number of weight vectors (clusters) to learn
    la: initial value of the coefficient λ
    dla: amount Δλ subtracted from λ after each block of epochs; must be > 0
    epochs: number of sweeps over the data per λ value
    seed: optional seed; when given, NumPy's global random state is re-seeded
          so that the weight initialisation (and the result) is reproducible
    returns: list with one cluster index (0 .. n_clusters-1) per row of data;
             an empty list when data is empty
    raises: ValueError if dla is not positive or n_clusters is smaller than 1
    '''
    if dla <= 0:
        raise ValueError("dla must be positive, otherwise the learning rate never reaches 0")
    if n_clusters < 1:
        raise ValueError("n_clusters must be at least 1")

    total_data = len(data)
    if total_data == 0:
        return []

    if seed is not None:
        np.random.seed(seed)

    features = len(data[0])
    # Same draw order as a nested loop: cluster by cluster, feature by feature.
    W = [
        [get_weights(total_data) * 0.5 for _ in range(features)]
        for _ in range(n_clusters)
    ]

    # Repeated float subtraction does not land exactly on 0 (0.3 - 6*0.05 ends
    # near 1e-17), so compare against a small tolerance. This avoids a final
    # block of epochs whose updates are numerically no-ops. A rate of exactly
    # 0 leaves the weights unchanged, so skipping it does not alter the result.
    tolerance = dla * 1e-9
    while la > tolerance:
        for _ in range(epochs):
            for x in data:
                # The winner is a row of W; updating it in place updates W.
                wm = find_closest_to_x(W, x)[0]
                for i in range(len(wm)):
                    wm[i] = wm[i] + la * (x[i] - wm[i])
        la = la - dla

    return [find_closest_to_x(W, x)[1] for x in data]

In [21]:
#Bringing in the original tags
conn = sqlalchemy.create_engine("mysql+pymysql://anuj:Anuj@21101998@localhost/auto_tagging_data")
df = pd.read_sql("question_master",conn)
tags = list(df["pre_tag"])

In [22]:
# Train the network on the scaled features and get one cluster index per row.
predictions = fit_predict(data)

In [24]:
# Score the predicted cluster indices against the original tags and report how
# many rows landed in each cluster.
# NOTE(review): cluster indices come from an unsupervised model; the
# 2=hard / 1=medium / 0=easy labelling is assumed here — confirm how `acc`
# aligns cluster indices with tags.
accuracy = acc(np.array(tags), np.array(predictions))
hard_count = predictions.count(2)
medium_count = predictions.count(1)
easy_count = predictions.count(0)
print(accuracy," and the distribution is ",hard_count,"hard ",medium_count,"medium ",easy_count,"easy")

0.53  and the distribution is  651 hard  642 medium  507 easy


