<a href="https://colab.research.google.com/github/annaqas/projects_codecademy/blob/main/Linear_Regression_at_Codecademy_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

# Load the data set from the Colab sample-data folder into a DataFrame
codecademy = pd.read_csv('/content/sample_data/codecademy.csv')

# Preview the data: show the first five rows
print(codecademy.head(5))

# Scatter plot with `completed` on the x-axis and `score` on the y-axis
plt.scatter(codecademy['completed'], codecademy['score'])
# Display the figure, then clear it so the next plot starts from a blank canvas
plt.show()
plt.clf()
# Simple linear regression: model `score` as a function of `completed`
# (the number of prior lessons completed)
model = sm.OLS.from_formula(formula='score ~completed', data=codecademy)
results = model.fit()
# Show the estimated coefficients (intercept and slope)
print(results.params)

# Intercept interpretation:
# A learner with 0 completed items on Codecademy has a predicted score of 13.21.

# Slope interpretation:
# Each additional completed item is associated with a 1.31-point increase
# in the predicted score.

# Redraw the scatter plot and overlay the fitted regression line
plt.scatter(codecademy['completed'], codecademy['score'])
plt.plot(codecademy['completed'], results.predict(codecademy))
# Display the figure, then clear it for the next plot
plt.show()
plt.clf()
# Predict score for a learner who has completed 20 prior lessons:
# intercept + slope * 20.
# `results.params` is a label-indexed pandas Series, so the coefficients are
# read by position with `.iloc`; plain integer keys (`params[0]`) depend on a
# positional fallback that pandas has deprecated.
pred20 = results.params.iloc[0] + results.params.iloc[1] * 20
# Fitted values: the model's prediction for every row in the data set
fitted_values = results.predict(codecademy)

# Residuals: observed score minus fitted score
residuals = codecademy['score'] - fitted_values

# Normality check: histogram of the residuals
plt.hist(residuals)
# Display the figure, then clear it for the next plot
plt.show()
plt.clf()

# Homoscedasticity check: residuals plotted against fitted values
plt.scatter(fitted_values, residuals)
# Display the figure, then clear it for the next plot
plt.show()
plt.clf()

# Create a boxplot of score (y-axis) grouped by lesson (x-axis).
# x and y are passed as keywords: seaborn 0.12+ no longer accepts them
# positionally, so `sns.boxplot(codecademy.lesson, codecademy.score)` fails.
sns.boxplot(x='lesson', y='score', data=codecademy)
# Show then clear plot
plt.show()
plt.clf()

# Regress `score` on the `lesson` column (which lesson the learner took)
model = sm.OLS.from_formula(formula='score ~ lesson', data=codecademy)
results = model.fit()
# Show the estimated coefficients
print(results.params)

# For comparison with the coefficients: mean score within each lesson group
# and the difference between the two group means
mean_score_lesson_A = codecademy.loc[codecademy['lesson'] == 'Lesson A', 'score'].mean()
mean_score_lesson_B = codecademy.loc[codecademy['lesson'] == 'Lesson B', 'score'].mean()
print('Mean score (Lesson A): ', mean_score_lesson_A)
print('Mean score (Lesson B): ', mean_score_lesson_B)
print(
    'The mean diference of the lesson A and lesson B is: '
    + str(mean_score_lesson_A - mean_score_lesson_B)
)

# Plot `score` against `completed` with `sns.lmplot()`, using a separate
# colour (hue) for each value of `lesson`
sns.lmplot(data=codecademy, x='completed', y='score', hue='lesson')
plt.show()