# Importing Packages
## Pandas, Numpy and Scikit-Learn

In [13]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Load the data set
## Use pandas module to load the data set

In [14]:
# The file is read from a path relative to the working directory. Its fields are
# separated by ';', so the separator is passed explicitly.
data = pd.read_csv("student-mat.csv", sep=";")

## Show the data frame
### First 5 rows (students)

In [15]:
# head() with no argument yields the first 5 rows; print() renders them as plain text.
print(data.head())

  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  ...  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher  ...   
1     GP   F   17       U     GT3       T     1     1  at_home     other  ...   
2     GP   F   15       U     LE3       T     1     1  at_home     other  ...   
3     GP   F   15       U     GT3       T     4     2   health  services  ...   
4     GP   F   16       U     GT3       T     3     3    other     other  ...   

  famrel freetime  goout  Dalc  Walc health absences  G1  G2  G3  
0      4        3      4     1     1      3        6   5   6   6  
1      5        3      3     1     1      3        4   5   5   6  
2      4        3      2     2     3      3       10   7   8  10  
3      3        2      2     1     1      5        2  15  14  15  
4      4        3      2     1     2      5        4   6  10  10  

[5 rows x 33 columns]


# Trim the data
### To show important attributes that are used and remove irrelevant ones 
#### Here we trim our data into 6 attributes

In [16]:
# Columns kept for modelling: the three grades plus three numeric attributes.
kept_columns = ["G1", "G2", "G3", "studytime", "failures", "absences"]

# Rebind `data` to a frame holding only those columns, then preview it.
data = data[kept_columns]
print(data.head())

   G1  G2  G3  studytime  failures  absences
0   5   6   6          2         0         6
1   5   5   6          2         0         4
2   7   8  10          2         3        10
3  15  14  15          3         0         2
4   6  10  10          2         0         4


# Separate data into 4 arrays

#### Define the attribute we are predicting. In this case, it is G3. This attribute is known as the *label*.

In [17]:
# Name of the column the model is trained to predict; the other kept columns act as features.
predict = "G3" # the label

#### The other attributes in the data frame are used to determine our labels, and are known as *features*. The NumPy module is used to separate the data into two arrays, one with the features, the other with the labels.

In [23]:
# Features: every remaining column except the label.
# The axis is chosen with the `columns=` keyword because the positional form
# `drop(labels, 1)` was deprecated in pandas 1.3 and raises a TypeError in pandas 2.x.
x = np.array(data.drop(columns=[predict]))  # features
y = np.array(data[predict])  # label(s)

## Split data into testing and training data 
#### Here, 90% of the data is used to train while the remaining 10% is for testing (validation). The right ratio depends heavily on the amount of data you have: the bigger the data set, the bigger the validation set can be.

#### We do this so that the model is not scored on data it has already seen (memorized) during training, which would make the measured score misleadingly high.

In [55]:
# Hold out 10% of the rows for testing and train on the remaining 90%.
# `train_test_split` is called through its imported name: the bare name `sklearn`
# is never bound in this notebook (the package is imported as `sk`), so the
# previous `sklearn.model_selection...` spelling fails on a fresh kernel.
# No random_state is passed, so the split (and every later score) differs per run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)  # 10% test size

# 1. Linear Regression Algorithm

#### Linear regression is essentially an algorithm that creates a best-fit line through the data points in a model

## Implementing the algorithm

In [56]:
# Create an untrained LinearRegression estimator with its default settings.
linear = linear_model.LinearRegression()

### Training and scoring the model

In [57]:
linear.fit(x_train, y_train)  # fit the model on the training set
acc = linear.score(x_test, y_test)  # score on the held-out test set (R^2 for LinearRegression)
print(acc)  # varies between runs because the train/test split is random

0.9170672638971288


## Viewing the constants
#### Showing the slope values and intercept of the fitted regression line. The 5 slope values (weights), one per feature, correspond to a graph in 5-dimensional space.

In [60]:
# coef_ holds one weight per feature, in the column order of x:
# G1, G2, studytime, failures, absences.
print('Coefficients: ', linear.coef_)  # slope for each feature
print('Intercepts: ', linear.intercept_)  # intercept

Coefficients:  [ 0.15529918  0.9839866  -0.20981952 -0.32538579  0.03787158]
Intercepts:  -1.5385664814951987


# Predicting a student's test score

#### After seeing the score of the algorithm, it's time to use the algorithm to perform predictions.

In [66]:
score_prediction = linear.predict(x_test)  # one predicted G3 per test-set row

# Walk the predictions, the feature rows and the true labels in lockstep.
# The loop variables are deliberately NOT named `x`: reusing that name would
# overwrite the feature array `x`, and re-running the train/test split cell
# afterwards would then fail.
for predicted, features, actual in zip(score_prediction, x_test, y_test):
    # int() truncates toward zero (it does not round); kept so the printed
    # values are the same as before.
    print(int(predicted), features, actual)

8 [8 9 2 0 2] 8
9 [11 10  2  0  0] 10
12 [12 13  2  0  2] 12
5 [7 7 3 0 0] 8
12 [13 13  2  0  2] 11
15 [16 15  2  0  2] 15
9 [ 9 10  3  0  9] 9
12 [11 13  2  0  2] 14
0 [12  0  3  2  0] 0
6 [ 7  8  2  3 10] 10
7 [10  8  2  0 10] 8
8 [10  9  3  1 28] 9
15 [16 15  4  0  7] 17
19 [18 18  1  0  8] 18
18 [18 18  1  0  6] 18
7 [10  8  2  0 14] 9
18 [18 18  3  0  5] 19
15 [14 16  3  0  0] 16
14 [13 14  1  0  0] 13
8 [ 6 10  2  0  4] 10
12 [12 12  1  0  4] 13
9 [11 10  1  0  0] 10
8 [9 9 2 0 4] 10
19 [17 18  2  0 21] 18
10 [13 10  2  1 22] 11
4 [ 6  5  1  1 14] 5
13 [13 13  2  0 23] 13
12 [14 12  2  1  8] 12
4 [ 6  5  1  0 14] 5
12 [12 13  1  1  6] 14
14 [14 14  1  0  2] 14
10 [11 11  2  0  4] 11
11 [13 12  3  0  6] 12
8 [10  9  2  0  0] 9
16 [17 16  2  0  0] 17
13 [12 13  2  0 14] 12
6 [8 7 1 1 7] 8
9 [ 8 10  2  0  6] 10
7 [8 8 2 0 0] 9
12 [14 12  2  0 20] 13
