In [None]:
import random
import numpy as np
import pandas as pd

def make_random_nan(elements: np.ndarray, nan_count: int) -> list:
    """Return a list copy of ``elements`` with randomly chosen entries set to NaN.

    Parameters
    ----------
    elements : np.ndarray or sequence
        Values to copy. Objects exposing ``tolist()`` (e.g. NumPy arrays) are
        converted with it; any other iterable is copied with ``list()``.
        The input itself is never modified.
    nan_count : int
        Number of distinct positions to overwrite with ``np.nan``.

    Returns
    -------
    list
        A new list with the same length as ``elements`` in which exactly
        ``nan_count`` positions hold ``np.nan``.

    Raises
    ------
    ValueError
        If ``nan_count`` is negative or larger than the number of elements.
    """
    # Work on a plain-list copy: an integer ndarray cannot hold NaN, a list can.
    values = elements.tolist() if hasattr(elements, "tolist") else list(elements)
    total = len(values)

    if nan_count < 0:
        raise ValueError("NaN count cannot be negative")
    if nan_count > total:
        raise ValueError("NaN count cannot be greater than elements count")

    # random.sample draws without replacement, so the positions are distinct
    # and exactly nan_count entries end up as NaN.
    for idx in random.sample(range(total), nan_count):
        values[idx] = np.nan
    return values

##### Handling Missing Data
* df.dropna() → removes rows with missing values.
* df.dropna(axis=1) → removes columns with missing values.
* df.dropna(thresh=n) → keeps only the rows (axis=0, the default) or columns (axis=1) that have at least n non-null values.

#### Steps below:
* Create a dummy dataset.
* Experiment with the missing-value handling methods listed above.

In [None]:
#create random dummy features
# Seed both generators so the same dummy data is produced on every run:
# NumPy's global generator draws the integers, while Python's `random`
# module picks the NaN positions inside make_random_nan.
FEATURE_SEED = 42
random.seed(FEATURE_SEED)
np.random.seed(FEATURE_SEED)

# Each feature holds 100 random integers drawn from [0, 10); the three
# features differ only in how many entries are replaced by NaN.
feature1 = make_random_nan(np.random.randint(0, 10, 100), nan_count=10)
feature2 = make_random_nan(np.random.randint(0, 10, 100), nan_count=14)
feature3 = make_random_nan(np.random.randint(0, 10, 100), nan_count=80)

In [None]:
# Assemble the three NaN-injected feature lists into one frame, one column per feature.
dummy_data = pd.DataFrame(
    data={
        "f1": feature1,
        "f2": feature2,
        "f3": feature3,
    }
)

#### Play around with different methods for filling NaN values:
#### Statistical filling:
* ```df['col'].fillna(df['col'].mean())```    # mean imputation
* ```df['col'].fillna(df['col'].median())```  # median imputation
* ```df['col'].fillna(df['col'].mode()[0])``` # mode imputation

#### Forward or Backward Filling:
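* ```df.ffill()```  # propagate the previous valid value forward (older pandas: ```df.fillna(method='ffill')```, deprecated since pandas 2.1)
* ```df.bfill()```  # propagate the next valid value backward (older pandas: ```df.fillna(method='bfill')```, deprecated since pandas 2.1)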

In [None]:
# Mean imputation: every NaN in column f3 is replaced by that column's mean.
dummy_data["f3"] = dummy_data["f3"].fillna(
    value=dummy_data["f3"].mean()
)

In [None]:
# Inspect the frame after imputation. The name `df` is not bound at this point
# in a fresh top-to-bottom run; the frame built above is `dummy_data`.
dummy_data.head()

#### Machine Learning:

* Linear Regression

In [None]:
#data preprocess
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#algorithm
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression

#metrics
from sklearn.metrics import r2_score, mean_squared_error

# Load the housing data; the path is relative to the notebook's working directory.
df = pd.read_csv("housing_price_dataset.csv")

# Predictors are every column except the target "Price"; get_dummies one-hot
# encodes any categorical columns among them.
X = pd.get_dummies(df.drop(columns=["Price"]), drop_first=False)
y = df["Price"].values

# Split BEFORE scaling: fitting the scaler on all rows would leak the test
# set's mean/std into training and make the reported score optimistic.
# A fixed random_state makes the split (and therefore the score) reproducible.
SPLIT_SEED = 42
train_x_raw, test_x_raw, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the held-out rows.
scaler = StandardScaler(with_mean=True)
train_x = scaler.fit_transform(train_x_raw)
test_x = scaler.transform(test_x_raw)
# Full matrix scaled with the train-only statistics; keeps the `Xs` name defined.
Xs = scaler.transform(X)

model = LinearRegression()
model.fit(train_x, train_y)

predictions = model.predict(test_x)

print(f"R2 score: {r2_score(test_y,predictions)}")

In [None]:
#toy problem to play with
#given the muscle mass we predict athletic performance

muscle_mass          = [12,13,11,15,16,20,25,27]
real_ath_performance = [3,4,5,6.4,7.2,7.9,9,12]

pred_ath_performance = [3.5,2,4.8,6,6.8,7.5,10,11.7]

real_arr = np.asarray(real_ath_performance, dtype=float)
pred_arr = np.asarray(pred_ath_performance, dtype=float)

# Variance of the observations around their own mean: the error of the
# baseline model that always predicts the mean.
var_ath = np.mean((real_arr - real_arr.mean()) ** 2)
# Variance left over after the fit: mean squared residual of the predictions.
var_fit = np.mean((real_arr - pred_arr) ** 2)
# R^2 = share of the baseline variance explained by the fit
#     = 1 - SS_residual / SS_total
r2_score_ = (var_ath - var_fit) / var_ath
r2_score_

##### Assignment:
* Play with the above linear regression model by selecting different features and observe the metrics.
* In the toy problem, try to understand how the R² score changes as you plug in different values.
* Write a function that takes two lists as input arguments and computes the R² score.
* Try to understand gradient descent from a mathematical point of view (optional).