# Notes

## Imports

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()
from sklearn.linear_model import LinearRegression

## Loading Data

### CSV file with pandas

In [2]:
# Load the CSV into a DataFrame; the relative path is resolved against the kernel's working directory.
data = pd.read_csv("real_estate_price_size_year.csv")

### Get a preview

In [3]:
# Preview the first rows to confirm the expected columns (price, size, year) were read.
data.head()

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009


### General data

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
data.describe()

Unnamed: 0,price,size,year
count,100.0,100.0,100.0
mean,292289.47016,853.0242,2012.6
std,77051.727525,297.941951,4.729021
min,154282.128,479.75,2006.0
25%,234280.148,643.33,2009.0
50%,280590.716,696.405,2015.0
75%,335723.696,1029.3225,2018.0
max,500681.128,1842.51,2018.0


## OLS Regression

In [5]:
# Dependent variable (price) and the two regressors (size, year).
y = data.loc[:, "price"]
x1 = data.loc[:, ["size", "year"]]

# sm.OLS does not include an intercept on its own, so a constant column
# is added to the regressors before fitting.
x = sm.add_constant(x1)
ols_model = sm.OLS(y, x)
results = ols_model.fit()
# Keep the summary as the last expression so the notebook renders it.
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.776
Model:,OLS,Adj. R-squared:,0.772
Method:,Least Squares,F-statistic:,168.5
Date:,"Wed, 13 May 2020",Prob (F-statistic):,2.7700000000000004e-32
Time:,20:18:12,Log-Likelihood:,-1191.7
No. Observations:,100,AIC:,2389.0
Df Residuals:,97,BIC:,2397.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-5.772e+06,1.58e+06,-3.647,0.000,-8.91e+06,-2.63e+06
size,227.7009,12.474,18.254,0.000,202.943,252.458
year,2916.7853,785.896,3.711,0.000,1357.000,4476.571

0,1,2,3
Omnibus:,10.083,Durbin-Watson:,2.25
Prob(Omnibus):,0.006,Jarque-Bera (JB):,3.678
Skew:,0.095,Prob(JB):,0.159
Kurtosis:,2.08,Cond. No.,941000.0


## Dummy Variable Mapping

In [6]:
# data = pd.read_csv("file.csv")
# data["category"] = data["category"].map({"yes": 1, "no": 0})

## Making Predictions based on Regressions

In [7]:
#predictions = results.predict(x)  # exog must have the same columns as the fitted design matrix (including the constant)

## Same Regression Using SKLearn

In [8]:
#x_matrix = x.values.reshape(-1, 1)  # single feature -> 2-D column, as sklearn expects
#reg = LinearRegression()
#reg.fit(x_matrix, y)

### R-Squared

In [9]:
#reg.score(x_matrix, y)

# Adjusted R-squared
# r2 = reg.score(x, y)
# n = x.shape[0]
# p = x.shape[1]
# adjusted_r2=1-(1-r2)*(n-1)/(n-p-1)

### Coefficients

In [10]:
#reg.coef_

### Intercepts

In [11]:
#reg.intercept_

### Making Predictions

In [12]:
#reg.predict([[1740]])  # sklearn expects a 2-D input: one row per sample, one column per feature

#new_data = pd.DataFrame(data=[1740, 1760], columns=["SAT"])
#reg.predict(new_data)

## Feature Selection 

In [13]:
# from sklearn.feature_selection import f_regression
# f_regression(x, y)
# p_values = f_regression(x, y)[1]
# p_values.round(3)

## Standardizing Data


In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
# Create an unfitted scaler; the scaling statistics are learned later, when .fit() is called on the data.
scaler = StandardScaler()

In [16]:
#scaler.fit(x)
#x_scaled = scaler.transform(x)

### Predicting with standardized data

In [17]:
#new_data_scaled = scaler.transform(new_data)
#reg.predict(new_data_scaled)

## Train Test Split

In [18]:
from sklearn.model_selection import train_test_split
# Toy array used only to illustrate how train_test_split partitions data.
a = np.arange(1, 101)
# NOTE: `b` is not passed to the split below, and its length (599) differs from `a` (100).
b = np.arange(1, 600)

# shuffle=False keeps the original order, so the last 20% of `a` becomes the test set.
# random_state only seeds the shuffling; without shuffling it has no effect, so it is
# omitted to avoid suggesting that this split is randomised.
a_train, a_test = train_test_split(a, test_size=0.2, shuffle=False)

In [19]:
# Training part of the split: with shuffle=False this is the leading 80% of `a`, in original order.
a_train

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80])

In [20]:
# Test part of the split: with shuffle=False this is the trailing 20% of `a`, in original order.
a_test

array([ 81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,
        94,  95,  96,  97,  98,  99, 100])

## Plotting with Seaborn

In [21]:
#sns.distplot(data_no_mv["category"])

## Dealing with outliers

In [22]:
#remove with percentile
#q = data["category"].quantile(0.99)
#data_1 = data[data["category"] < q]

#re-indexing
#data_cleaned = data_1.reset_index(drop=True)

## Log Transformation

In [24]:
#data_log = np.log(data["category"])

## Calculating Multicollinearity

In [25]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [26]:
#variables = data[["mileage", "year", "EnginV"]]
#vif = pd.DataFrame()
#vif["VIF"] = [variance_inflation_factor(variables.values, i) for i in range(variables.shape[1])]
#vif["features"] = variables.columns

## Create dummy variables for categorical data automatically

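If a categorical feature has n categories, it is encoded with n-1 dummy variables: one category is dropped and serves as the baseline, which avoids perfect multicollinearity with the constant.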

In [3]:
#data_with_dummies = pd.get_dummies(data, drop_first=True)

## Logistic Regression

In [5]:
# x = sm.add_constant(x1)
# reg_log = sm.Logit(y, x)
# results_log = reg_log.fit()

## Logistic Accuracy

In [1]:
# results_log.pred_table()