In [1]:
import pandas as pd
import json
import math

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Every subject column used by this notebook. 'Mathematics' is the value being
# predicted; the remaining subjects serve as input features.
subjects = [
    'Physics',
    'Chemistry',
    'ComputerScience',
    'Hindi',
    'Biology',
    'PhysicalEducation',
    'Economics',
    'Accountancy',
    'BusinessStudies',
    'English',
    'Mathematics',
]

In [3]:
# Load the training records: after one leading line that is not a record
# (presumably a record count -- TODO confirm), each line holds one JSON object
# mapping subject names to marks, plus a 'serial' identifier.
train = []
with open('training.json', encoding='utf-8') as file:
    # Skip the leading non-record line without assigning to `_`, which IPython
    # reserves for the previous cell output.
    next(file, None)
    for line in file:
        if not line.strip():
            # Blank or trailing lines would make json.loads raise.
            continue
        marks = json.loads(line)
        # 'serial' is an identifier, not a feature; tolerate records without it.
        marks.pop('serial', None)
        for subject in subjects:
            # A subject missing from the record is recorded as a mark of 0.
            marks.setdefault(subject, 0)
        train.append(marks)

In [4]:
# Build the training frame with one column per subject (in `subjects` order),
# then pull out the 'Mathematics' column as the regression target.
x_train = pd.DataFrame(data=train, columns=subjects)
y_train = x_train.loc[:, 'Mathematics']
# Preview the first five rows (target column still present at this point).
x_train.head(5)

Unnamed: 0,Physics,Chemistry,ComputerScience,Hindi,Biology,PhysicalEducation,Economics,Accountancy,BusinessStudies,English,Mathematics
0,8,7,0,0,0,3,0,0,0,4,6
1,1,1,0,0,0,1,0,0,0,3,3
2,1,2,0,0,0,2,0,0,0,1,2
3,8,7,0,0,0,6,0,0,0,7,7
4,1,1,0,0,0,1,0,0,0,3,2


In [5]:
# Remove the target column so only feature columns remain in x_train.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been dropped no longer raises KeyError.
x_train = x_train.drop(columns='Mathematics', errors='ignore')

In [6]:
# Load the test records: after one leading line that is not a record
# (presumably a record count -- TODO confirm), each line holds one JSON object
# mapping subject names to marks, plus a 'serial' identifier.
test = []
with open('sample-test.in.json', encoding='utf-8') as file:
    # Skip the leading non-record line without assigning to `_`, which IPython
    # reserves for the previous cell output.
    next(file, None)
    for line in file:
        if not line.strip():
            # Blank or trailing lines would make json.loads raise.
            continue
        marks = json.loads(line)
        # 'serial' is an identifier, not a feature; tolerate records without it.
        marks.pop('serial', None)
        for subject in subjects:
            # A subject missing from the record is recorded as a mark of 0.
            marks.setdefault(subject, 0)
        test.append(marks)

In [7]:
# Select the feature columns by name instead of slicing off the last entry, so
# the result stays correct even if 'Mathematics' is not the final element of
# `subjects`. Relative order of the remaining subjects is preserved.
feature_columns = [subject for subject in subjects if subject != 'Mathematics']
x_test = pd.DataFrame(test, columns=feature_columns)
# Preview the first rows of the test features.
x_test.head()

Unnamed: 0,Physics,Chemistry,ComputerScience,Hindi,Biology,PhysicalEducation,Economics,Accountancy,BusinessStudies,English
0,2,2,0,0,1,0,0,0,0,1
1,3,3,4,0,0,0,0,0,0,4
2,0,0,0,0,0,0,3,1,1,1
3,2,2,2,0,0,0,0,0,0,1
4,0,0,0,0,0,0,3,5,4,2


In [8]:
# Load the expected answers for the test set: one JSON value per line, each
# converted to int. Unlike the input files, no leading line is skipped here.
actual = []
with open('sample-test.out.json', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            # Blank or trailing lines would make json.loads raise.
            continue
        actual.append(int(json.loads(line)))

In [9]:
# Fit a linear regression of the Mathematics mark on the other subjects' marks.
regressor = LinearRegression()
# fit() is the last expression, so the notebook displays the fitted estimator.
regressor.fit(X=x_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [10]:
# Predict the Mathematics mark for every row of the test features.
predict = regressor.predict(X=x_test)

In [11]:
# Root-mean-squared error of the predictions against the expected marks.
mse = mean_squared_error(actual, predict)
math.sqrt(mse)

1.5339455147908452

In [12]:
# Show the fitted model parameters; coef_ follows the column order of x_train.
for label, value in (('Intercept: ', regressor.intercept_),
                     ('Coefficient: ', regressor.coef_)):
    print(label, value)

Intercept:  0.691968205775058
Coefficient:  [3.55137581e-01 3.90069586e-01 1.08369376e-01 1.11022302e-16
 1.68648162e-01 1.07352663e-01 2.85098020e-01 6.58613297e-01
 3.19266029e-01 5.54039127e-02]


In [13]:
# Put expected and predicted marks side by side for a quick visual comparison.
comparison = {
    'Actual': actual,
    'Predicted': predict.flatten(),  # flatten() yields a 1-D array for the column
}
df = pd.DataFrame(comparison)
print(df.head())

   Actual  Predicted
0       2   2.406435
1       4   3.582683
2       1   2.580546
3       2   2.454525
4       8   6.228201
