**IMPORTING ALL THE USEFUL LIBRARIES**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
%matplotlib inline
import seaborn as sb
import sklearn
from pylab import rcParamsOrig
from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp
from scipy.stats import chi2_contingency
from sklearn.utils import shuffle
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from collections import Counter

**CLEANING OF DATA**

In [None]:
# Load the housing data from the CSV file.
fd = pd.read_csv('chennai_house_price_prediction.csv')

# Quick structural checks. Only the final expression of a notebook cell is
# rendered, so the lines below are evaluated but not displayed.
fd.shape           # (number of rows, number of columns)
fd.head(10)        # first 10 rows
fd.tail(10)        # last 10 rows
fd.columns         # names of all the columns
fd.dtypes          # dtype of every column
fd.isna().sum()    # missing values per column, before filling

# Replace every missing value with 0.
fd = fd.fillna(0)

fd.isna().sum()    # missing values per column, after filling
fd.dtypes          # cell output: dtypes after filling

**INVESTIGATION BETWEEN CATEGORICAL VARIABLES**

In [None]:
# Two categorical variables: parking facility vs. build type.
# Build the contingency table of counts once and reuse it for the test and the plot.
park_vs_build = pd.crosstab(fd['PARK_FACIL'], fd['BUILDTYPE'])
# Cell output is a tuple: (chi-square test result, Axes of the grouped bar chart).
chi2_contingency(park_vs_build), park_vs_build.plot.bar()


**COMBINATION OF CATEGORICAL AND CONTINUOUS VARIABLES**

In [None]:
# Categorical vs. continuous: mean SALES_PRICE within each PARK_FACIL group,
# shown as a bar chart.
mean_price_by_parking = fd.groupby('PARK_FACIL')['SALES_PRICE'].mean()
mean_price_by_parking.plot.bar()

**T TEST**

In [None]:
# Two-sample t-test: does the mean SALES_PRICE differ between rows whose
# PARK_FACIL is 'Yes' and rows whose PARK_FACIL is 'No'?
yes = fd[fd['PARK_FACIL'] == 'Yes']
no = fd[fd['PARK_FACIL'] == 'No']

# Column means of each group. The frame holds string columns (PARK_FACIL itself
# is one), and DataFrame.mean() raises TypeError on those in pandas >= 2.0, so
# the means are restricted to numeric columns.
a = yes.mean(numeric_only=True)
b = no.mean(numeric_only=True)

# nan_policy='omit' leaves NaN prices out of the calculation.
_, p_value = ttest_ind(yes['SALES_PRICE'], no['SALES_PRICE'], nan_policy='omit')
print("P VALUE IS : ", p_value)
if p_value < 0.05:
  print('REJECTING NULL HYPOTHESIS')
else:
  print('ACCEPTING NULL HYPOTHESIS')

**HYPOTHESIS**

This is the dataset in which we have to predict the sales price of a house.
So we have to wisely select the variables on which we perform the operations.
Some variables should represent the number of bedrooms and bathrooms, the locality where the house is located, and other such attributes.
Here we also study the means of the groups.

**NULL HYPOTHESIS**

The null hypothesis states that there is no statistically significant difference between the two groups, i.e. that the means of the two groups are the same. The alternative hypothesis states that the group means differ.

In [None]:
# One-hot encode each categorical column into its own frame of indicator columns.
area = pd.get_dummies(fd['AREA'])
salecond = pd.get_dummies(fd['SALE_COND'])
parking = pd.get_dummies(fd['PARK_FACIL'])
buildtype = pd.get_dummies(fd['BUILDTYPE'])
utility = pd.get_dummies(fd['UTILITY_AVAIL'])
street = pd.get_dummies(fd['STREET'])
mzzone = pd.get_dummies(fd['MZZONE'])
area.head()

# Swap the original categorical columns for their indicator columns.
categorical_cols = ['AREA', 'SALE_COND', 'PARK_FACIL', 'BUILDTYPE',
                    'UTILITY_AVAIL', 'STREET', 'MZZONE']
fd1 = fd.drop(columns=categorical_cols)
fd2 = pd.concat(
    [fd1, area, salecond, parking, buildtype, utility, street, mzzone],
    axis=1,
)

In [None]:
# Remove the PRT_ID column before modelling.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# safe to re-run: a second in-place drop raises KeyError because the column is
# already gone.
fd2 = fd2.drop(columns=['PRT_ID'], errors='ignore')

# Last expression of the cell, so the preview is actually rendered.
fd2.head()

**LINEAR REGRESSION ANALYSIS**

In [None]:
# Linear regression: SALES_PRICE is the target, every other column of fd2 is a feature.
y = fd2['SALES_PRICE']
x = fd2.drop(columns=['SALES_PRICE'])
x.shape, y.shape

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.2, random_state=46
)

lr = LR()
lr.fit(train_x, train_y)
lr.score(test_x, test_y)   # evaluated but not rendered (not the last expression)

# Cross-validate over 10 random 80/20 splits of the full data; the array of
# per-split scores is the cell output.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
cross_val_score(lr, x, y, cv=cv)

In [None]:
#prediction over train set and calculating error 
train_predict=lr.predict(train_x)
k=mae(train_predict,train_y)
print('TRAINING MEAN ABSOLUTE ERROR',k)

In [None]:
# Predict on the held-out test rows and report the mean absolute error.
test_predict = lr.predict(test_x)
# Arguments follow sklearn's (y_true, y_pred) order; MAE is symmetric, so the
# value is the same either way round.
k = mae(test_y, test_predict)
print('TESTING MEAN ABSOLUTE ERROR', k)

In [None]:
# Model score on the training rows and on the held-out rows, shown side by side.
train_score = lr.score(train_x, train_y)
test_score = lr.score(test_x, test_y)
train_score, test_score

In [None]:
def predict_price(area,sqft,bath,bhk):
  """Predict the sale price of one house with the fitted linear model ``lr``.

  Builds a single feature row that is zero everywhere except the first three
  feature positions and the indicator column named ``area``.

  Parameters:
    area: name of the indicator column to switch on (e.g. 'Adyar'). If no
      feature column has this name, no indicator is set and the prediction is
      made with that row as-is.
    sqft, bath, bhk: values written to feature positions 0, 1 and 2.
      NOTE(review): this assumes the first three feature columns are the
      square footage, bathroom count and bedroom count, in that order -
      confirm against train_x.columns.

  Returns:
    The model's prediction for that single row (one number).
  """
  # Read the feature layout from the training frame instead of the global ``x``,
  # so the function does not depend on ``x`` still holding the feature matrix
  # at call time.
  feature_names = train_x.columns

  # np.where yields an empty array for an unknown name; indexing it with [0]
  # raised IndexError before the guard below could ever run.
  matches = np.where(feature_names == area)[0]
  loc_index = matches[0] if len(matches) > 0 else -1

  X = np.zeros(len(feature_names))
  X[0] = sqft
  X[1] = bath
  X[2] = bhk
  if loc_index >= 0:
    X[loc_index] = 1

  # Pass a labelled one-row frame so the feature names match those used in fit.
  return lr.predict(pd.DataFrame([X], columns=feature_names))[0]


In [None]:
# Example predictions; each tuple is (area, sqft, bath, bhk).
examples = [
  ('Velachery', 1220, 2, 3),
  ('Adyar', 1340, 3, 2),
  ('Adyar', 2222, 2, 4),
]
for example in examples:
  print(predict_price(*example))

**SCATTER PLOT FOR REGRESSION AND CORRELATION**

In [None]:
#CORRELATION SCATTER PLOT
a=fd.corr(method='pearson')         #correlation matrix
#sb.pairplot(a)
fd.plot.scatter('SALES_PRICE','INT_SQFT')

In [None]:
#REGRESSION SCATTER PLOT
x=fd.sample(200)
ax=x['SALES_PRICE']
ay=x['INT_SQFT']
sb.regplot(ax,ay,color='r',data=fd)

**ANOVA TEST**

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
# ANOVA of SALES_PRICE against STREET: fit an OLS model with the formula API,
# then build the ANOVA table from the fitted model (the table is the cell output).
lm = ols('SALES_PRICE~STREET', data=fd).fit()
sm.stats.anova_lm(lm)

**BAR CHART FOR ANOVA**

In [None]:
# Bar chart of the ANOVA table for the fitted model ``lm``:
# one group of bars per table row, one bar per table column.
anova_table = sm.stats.anova_lm(lm)
anova_table.plot.bar()