### Imports/Setting Up and Cleaning Data


In [None]:
# import all necessary packages

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as rmse

In [None]:
# Load the raw dataset from the CSV file in the working directory

data = pd.read_csv('data.csv')

# Keep only the two columns used for the regression.
# .copy() makes data_small an independent DataFrame, so modifying it later
# (e.g. filling missing values) does not raise pandas' SettingWithCopyWarning.

data_small = data[['MP', 'PTS']].copy()


# Preview the first few rows with .head()
data_small.head()

In [None]:
# Visualize the data: scatter plot of PTS against MP with a third-order
# polynomial fit drawn on top and no confidence band

sns.lmplot(data=data_small, x="MP", y="PTS", order=3, ci=None)

In [None]:
# Clean the data by forward-filling missing values: each NaN is replaced with
# the last valid value above it in the same column.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill'), and the
# result is reassigned instead of using inplace=True so that a column selection
# taken from another DataFrame is never modified in place.
# NOTE: NaNs in the leading rows have no earlier value to copy and remain NaN.

data_small = data_small.ffill()

### Training/Visualizing the Model

In [None]:
# Turn each column into a 2-D numpy array of shape (n_samples, 1)

X = data_small[['MP']].to_numpy()   # feature column "MP"
y = data_small[['PTS']].to_numpy()  # target column "PTS"

# Randomly split into training and test sets; the test set is 20% of all rows

splits = train_test_split(X, y, test_size=0.2)
X_train, X_test, y_train, y_test = splits

In [None]:
# Train a linear regression on the training split (fit() returns the fitted
# estimator, so construction and fitting can be chained)

regression = LinearRegression().fit(X_train, y_train)

# Report the model's score() on the held-out test split
print(regression.score(X_test, y_test))

In [None]:
# Predict the target for every test sample
y_pred = regression.predict(X_test)

# Actual test observations as red points
plt.scatter(x=X_test, y=y_test, color="red")

# Model predictions drawn as the blue fitted line
plt.plot(X_test, y_pred, color="blue")

In [None]:
# Calculate the RMSE (root mean squared error) on the test split

# The error must compare the true targets (y_test) with the predictions
# (y_pred); comparing against the input feature X_test gives a meaningless value.
# `rmse` is this file's alias for sklearn's mean_squared_error, which returns the
# squared error, so the root is taken explicitly with np.sqrt. This avoids the
# `squared=False` argument, which newer scikit-learn releases no longer accept.

print(np.sqrt(rmse(y_test, y_pred)))