In [7]:
#include all the necessary libraries
import math
import operator
import os
import random

import numpy as np
import pandas as pd
import unicodecsv

In [8]:
#getdata() function definition
def getdata(filename):
    """Load a CSV file and shorten its verbose column names.

    Parameters
    ----------
    filename : str or path-like
        CSV file handed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents with "Operator", "In Out Travelling",
        "Network Type", "Latitude" and "Longitude" renamed to "op", "inOut",
        "nwrk", "lat" and "long". Columns not listed keep their names.
    """
    # Short aliases used by the rest of the notebook.
    short_names = {
        "Operator": "op",
        "In Out Travelling": "inOut",
        "Network Type": "nwrk",
        "Latitude": "lat",
        "Longitude": "long",
    }
    return pd.read_csv(filename).rename(columns=short_names)

In [9]:
#Function to get the euclidean distance between two entries x and y
def euclideanDist(x, y):
    """Return the scaled straight-line distance between two entries.

    Parameters
    ----------
    x, y : mapping
        Each must provide 'lat' and 'long' values that ``float()`` accepts.

    Returns
    -------
    float
        Square root of the summed squares of the coordinate differences,
        where each difference is divided by 10 before squaring.
    """
    lat_gap = (float(x['lat']) - float(y['lat'])) / 10
    long_gap = (float(x['long']) - float(y['long'])) / 10
    return math.sqrt(lat_gap ** 2 + long_gap ** 2)

In [10]:
#Function to generate the report for the predicted results
def report(k_value, month, test_file='testing1.csv', report_file='report.csv'):
    """Score a saved prediction file against the observed labels and log the result.

    Reads the predictions from 'output_<k_value>_<month>_.csv' and the
    observed values from ``test_file``, compares the "Rating" and
    "Call Drop Category" columns row by row, prints the counts and the two
    accuracies, and appends one summary row to ``report_file``.

    Parameters
    ----------
    k_value : int
        Number of neighbours used for the prediction; part of the prediction
        file name and stored in the report row.
    month : str
        Month label; part of the prediction file name and stored in the
        report row.
    test_file : str, optional
        CSV holding the observed values (default 'testing1.csv').
    report_file : str, optional
        CSV the summary row is appended to (default 'report.csv').

    Raises
    ------
    KeyError
        If ``test_file`` has no row for one of the predicted row labels.
    """
    ou_df = pd.read_csv('output_' + str(k_value) + '_' + month + '_.csv')  # predicted results
    tst_df = pd.read_csv(test_file)                                        # observed results

    # Pair every predicted row with the observed row carrying the same index label.
    observed = tst_df.loc[ou_df.index]

    r = float(ou_df["Rating"].eq(observed["Rating"]).sum())                          # correctly predicted ratings
    v = float(ou_df["Call Drop Category"].eq(observed["Call Drop Category"]).sum())  # correctly predicted categories
    t = float(len(ou_df))                                                            # number of scored entries

    # With no scored entries the accuracy is undefined; report NaN instead of dividing by zero.
    ra = (r / t) * 100 if t else float('nan')    # rating prediction accuracy
    va = (v / t) * 100 if t else float('nan')    # category prediction accuracy
    print()
    print(r, v, t)
    print("Rating Acc", ra)
    print("Cat Acc", va)

    df1 = pd.DataFrame(
        [[ra, va, r, v, t, k_value, month]],
        columns=["Rating Accuracy", "Category Accuracy", "r", "c", "t", "k_value", "Month"],
    )
    # The file is opened in append mode, so emit the header only when starting a
    # new (or empty) file; otherwise every run would insert another header row.
    write_header = not os.path.exists(report_file) or os.path.getsize(report_file) == 0
    df1.to_csv(report_file, index=False, mode='a', header=write_header)


In [15]:
#KNN prediction and model training
#Function takes training and testing data set , month name and k_value as input
def knn_predict(test_data, train_data_in, month, k_value):
    """Predict rating and call drop category for each test entry by k-nearest-neighbour vote.

    For every row of ``test_data`` the training rows with the same 'op',
    'inOut' and 'nwrk' values are ranked by ``euclideanDist`` on 'lat'/'long',
    and the most frequent "Rating" and "Call Drop Category" among the
    ``k_value`` closest rows become the prediction. The predictions are
    written to 'output_<k_value>_<month>_.csv' and then scored by ``report``.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Entries to predict; needs 'op', 'inOut', 'nwrk', 'lat' and 'long'.
    train_data_in : pandas.DataFrame
        Labelled entries; needs the same columns plus "Rating" and
        "Call Drop Category".
    month : str
        Label used in the output file name.
    k_value : int
        Number of nearest neighbours that vote.

    Notes
    -----
    The vote counters have 6 slots for ratings and 4 for categories, so
    labels must convert to ints in 0..5 and 0..3 respectively. Ties go to the
    lowest label. A test entry with no matching training rows is predicted as
    (0, 0) because every counter stays at zero.
    """
    out = []    # one (rating, category) prediction per test entry
    for i in test_data.index:
        print(i, end=' ')                   # progress: id of the entry being processed
        inp = test_data.loc[i].to_dict()    # the test entry, built once per entry

        # Only training rows that share operator, in/out state and network type can vote.
        same_context = (
            (train_data_in['op'] == inp['op'])
            & (train_data_in['inOut'] == inp['inOut'])
            & (train_data_in['nwrk'] == inp['nwrk'])
        )
        train_data = train_data_in[same_context]

        # (rating, category, distance) for each candidate, read column-wise
        # rather than materialising every training row several times.
        eu_Distance = [
            (rating, category, euclideanDist(inp, {'lat': lat, 'long': lon}))
            for rating, category, lat, lon in zip(
                train_data['Rating'],
                train_data["Call Drop Category"],
                train_data['lat'],
                train_data['long'],
            )
        ]

        # Stable sort: equally distant candidates keep their training order.
        eu_Distance.sort(key=operator.itemgetter(2))
        knn = eu_Distance[:k_value]

        # Count the votes of the k nearest neighbours.
        r = [0, 0, 0, 0, 0, 0]    # frequency of each rating value
        v = [0, 0, 0, 0]          # frequency of each call drop category value
        for nb_rating, nb_category, _ in knn:
            r[int(nb_rating)] += 1
            v[int(nb_category)] += 1

        out.append((r.index(max(r)), v.index(max(v))))

    # Export the predictions, then score them.
    df1 = pd.DataFrame(out, columns=['Rating', "Call Drop Category"])
    df1.to_csv('output_' + str(k_value) + '_' + month + '_.csv', index=False)
    report(k_value, month)
          


#### We provide a sample test file and a sample training file so the code can be run quickly; running on the original data takes at least 3-4 hours. The sample file names are hardcoded below to keep the demo simple.

In [16]:
# Demo run on the bundled sample files with k = 101 neighbours.
month = 'default_month'
tst_df = getdata('testing1.csv')    # entries to predict
trn_df = getdata('training.csv')    # labelled entries searched for neighbours
print()
knn_predict(test_data=tst_df, train_data_in=trn_df, month=month, k_value=101)


0 1 2 3 4 5 6 7 8 
3.0 9.0 9.0
Rating Acc 33.33333333333333
Cat Acc 100.0
