
## Reading data from python

# pandas documentation http://pandas.pydata.org/

In [2]:
import pandas as pd

# Loading a dataset into a DataFrame. The four readers below are
# alternatives; use the one that matches the source format.

# CSV without a header row: header=None makes pandas number the columns
df = pd.read_csv('url', header=None)

# JSON: read_json has no `header` parameter
df = pd.read_json('url')

# Excel without a header row
df = pd.read_excel('url', header=None)

# SQL: read_sql takes a query (or table name) plus a connection / database URI
# (a string URI requires SQLAlchemy to be installed)
df = pd.read_sql('SELECT * FROM table_name', 'sqlite:///database.db')

# defining header names (one entry per column; `...` is a placeholder)
headers = ['sex', 'age',..., 'price']

# assigning names to column headers
df.columns = headers

# OR pass the names while reading

df = pd.read_csv('url', names=headers)

# Exporting dataset 

In [None]:
# One writer per target format, mirroring the read_* functions.
df.to_csv('path')
df.to_json('path')
df.to_excel('path')
# to_sql needs a table name plus a connection / database URI
df.to_sql('table_name', 'sqlite:///database.db')

# Getting Started with Data Analysis in Python

# pandas data types:

###  object (strings)
###  int64  (integers)
###  float64 (numbers with decimals)
###  datetime64[ns], timedelta64[ns] -- time series types

##  python native data types:

###  string
###  int
###  float
###  N/A (i.e. no built-in time series data type in native Python; the standard-library `datetime` module provides date/time objects)

In [None]:
## data type

df.dtypes

In [None]:
## Statistical Summary

df.describe()  # numeric columns only (the default)
df.describe(include='all')  # ALL columns, including object/categorical ones

In [None]:
## Concise Summary of Data - index range, column names, non-null counts, dtypes and memory usage
## (info() does not print data rows; use df.head() / df.tail() for that)

df.info()


# Changing the values in specific column

In [5]:
# add 1 to every value in the column (element-wise) and store the result back
df['columnname'] = df['columnname'] + 1

NameError: name 'df' is not defined

# Handling/replacing missing values in python

### First: Evaluating for missing values

# Boolean mask: True where a value is missing
missing_data = df.isnull()

# Complementary mask: True where a value is present
non_missing_data = df.notnull()

# Preview the first five rows of the mask
missing_data.head(5)

## Count missing values in each column!!!

In [None]:
# Per column, tally the mask values: True = missing, False = present.
for column in missing_data.columns:
    print(column)
    print(missing_data[column].value_counts())  # counts of both True and False
    print("")

# When dropping values from a column, INDEX should be reset!

In [None]:
# drop every row whose 'price' is missing (axis = 0 selects rows; inplace = True mutates df)

df.dropna(subset=['price'], axis = 0, inplace = True)

# dropping rows leaves gaps in the index: renumber it, discarding the old index (drop=True)
df.reset_index(drop=True, inplace=True)



In [None]:
df.dropna() # axis = 0 to drop ROW, axis = 1 to drop COLUMN

In [None]:
df.dropna(subset = ['price'], axis = 0, inplace = True)

# inplace = True : changes happen in the dataframe directly

df.dropna(subset = ['price'], axis = 0) # DOES NOT change the dataframe; the result must be assigned to be kept

In [None]:
## Replacing missing values - general form (the mean-based version follows in the next cell)

import numpy as np
df.replace(missing_value, new_value)  # missing_value / new_value are placeholders, not defined in these notes

In [None]:
### replacing with mean of a column
import numpy as np

mean = df['columnname'].mean()
# replace() returns a new Series, so assign it back to keep the result
df['columnname'] = df['columnname'].replace(np.nan, mean)

# Example of replacing NaN with mean

# cast to float before averaging
avg = df['columnname'].astype('float').mean(axis=0)
# assign back instead of inplace=True on the selected column, which may
# operate on a temporary copy and leave df unchanged
df['columnname'] = df['columnname'].replace(np.nan, avg)

In [None]:
## Replacing values in the ENTIRE dataset

import numpy as np

df.replace('?', np.nan, inplace = True)


# Counting the frequency of different values in a feature



In [None]:
df['columnname'].value_counts() # number of occurrences of each distinct value

## Getting the most frequent value in a feature (the label with the highest count)

df['columnname'].value_counts().idxmax()

# Data Formatting or Data Standardization - bringing data into a common format

#### Data formatting makes data more understandable and makes statistical analysis easier, e.g. replacing the variants NY, N.Y. and New York with the single value 'New York', OR converting mpg (miles per gallon) to a metric unit (litres per 100 km).

In [None]:
# mpg to L/100km: fuel consumption in L/100km = 235 / mpg
df['mpg'] = 235/df['mpg']

# renaming the column to reflect the new unit
# NOTE(review): the values are litres per 100 km, so 'L/100km' would be a
# more accurate label than 'L/Km' - confirm before changing the name

df.rename(columns={'mpg': 'L/Km'}, inplace=True)

# Converting to correct data types

In [None]:
# checking data types (dtypes is an attribute, not a method)

df.dtypes


# converting data types: astype(dtype) returns a converted copy,
# so the result has to be assigned back

# e.g.

df['price'] = df['price'].astype('int')

# several columns at once (`...` stands for further column names)
df[['col1', 'col2',...]] = df[['col1', 'col2',...]].astype('int')

# Data Normalization in Python (important to data pre-processing)


#### data normalization makes the RANGE OF VALUES consistent. This makes statistical analysis easier down the road. it enables a fair comparison between features.

# Why is Data Normalization important?

##### Take the comparison of 'age' and 'income' columns in a dataset. 'Age' ranges from 0 -100 but 'income' might range from 20,000 to 500,000. If we're running Linear Regression, for example, the wider range of values for the 'income' column impacts the output of the model more heavily, even though it is not necessarily a more important variable than 'age'.

# Ways to normalize value:

In [None]:
# The three methods below are alternatives - apply only one to a given column.

# Method 1: Simple Feature Scaling
#   Xnew = Xold / Xmax

# e.g.

df['length'] = df['length']/df['length'].max()

# Method 2: Min-Max method
#   Xnew = (Xold - Xmin) / (Xmax - Xmin)

# e.g.

df['length'] = (df['length'] - df['length'].min()) / (df['length'].max() - df['length'].min())

# Method 3: Z-Score
#   Xnew = (Xold - mean(X)) / std(X)

# e.g.

df['length'] = (df['length'] - df['length'].mean()) / df['length'].std()



# Binning - Values into groups
 #### Binning can help us better understand the distribution of values of a numerical feature, e.g. price could be binned into low, medium and high
 
### Creating 3 bins based on the 'price' column


In [None]:
import numpy as np

# step 1: 4 equally spaced edges spanning the feature's range -> 3 equal-width bins
# (Series.min()/max() skip NaN, unlike the built-in min()/max())

bins = np.linspace(df['price'].min(), df['price'].max(), 4)

# step 2: bin names (one fewer than the number of edges)

group_names = ['low', 'medium', 'high']

# step 3: creating the 'price-binned' column; include_lowest=True keeps the
# minimum price inside the first bin

df['price-binned'] = pd.cut(df['price'], bins, labels=group_names, include_lowest=True)

# Converting Categorical into Numerical variables

### We do this because most statistical models cannot take in object or strings as input

## Method: One-hot encoding OR Creating dummy variables ( assigning 0 or 1 in each category)



In [None]:
## creating dummies: one indicator column per distinct value of 'fuel'

dummy_fuel = pd.get_dummies(df['fuel'])

### Merging Dummy variables with dataset and dropping original column

In [None]:
# append the dummy columns side by side (axis = 1) to the original frame
df = pd.concat([df, dummy_fuel], axis = 1)



## Dropping original variable (axis = 1 targets a column)

df.drop('fuel', axis = 1, inplace = True)

# Histogram

In [None]:
%matplotlib inline
# Bind `plt` to the pyplot interface (the usual convention) so plotting
# functions are reached as plt.hist(...), plt.xlabel(...), etc.
import matplotlib.pyplot as plt
from matplotlib import pyplot

# histogram without specifying bins
plt.hist(df['colname'])

# histogram with 3 bins
plt.hist(df['colname'], bins=3)

# Setting x & y labels and the title

plt.xlabel('horsepower')
plt.ylabel('count')
plt.title('horsepower bins')

# Bar Plot

In [None]:
%matplotlib inline
# Bind `plt` to the pyplot interface (the usual convention).
import matplotlib.pyplot as plt
from matplotlib import pyplot

# Take labels and heights from the same value_counts() result so every bar
# is paired with its own category (value_counts orders by frequency, which
# need not match a separately defined list of names).
counts = df['colname'].value_counts()
plt.bar(counts.index, counts.values)


# Setting x & y labels and the title

plt.xlabel('horsepower')
plt.ylabel('count')
plt.title('horsepower bins')

# Descriptive Statistics
#### Statistically summarizing data to get to know the data better.

## Boxplot

#### Boxplots are great for visualizing the distribution of numeric data

In [None]:
# NOTE(review): `sns` is not imported anywhere in the visible notes - presumably `import seaborn as sns`; confirm.
# Both "colname" values are placeholders; replace them with two different columns.
sns.boxplot(x = "colname", y= "colname", data = df)

In [1]:
# Scatterplot

#### A scatterplot allows us to examine the relationship between the predictor (x-axis) and the target (y-axis) variable.

In [None]:
# target on the y-axis, predictor on the x-axis (replace the placeholders
# with two different column names)
y = df["colname"]
x = df["colname"]
# seaborn has no `scatter` function; the scatter plot API is `scatterplot`
# NOTE(review): assumes `sns` is seaborn and `plt` is matplotlib.pyplot - confirm
sns.scatterplot(x=x, y=y)


# axis titles

plt.title("plot title")
plt.xlabel("x-axis title")
plt.ylabel("y-axis title")

# Grouping Data in Python - .Groupby()

#### 1. Groupby method is used on categorical variables.
#### 2. .Groupby() groups data into subsets according to the categories of the variable
#### 3. Groupby() can group by a single or multiple variables

In [None]:
# FOR EXAMPLE: Examining the average "price" across different "drive-wheels" and "body-style" categories.

# Step 1: Creating a dataset that consists of all columns we want to look at

df_test = df[['drive-wheels', 'body-style', 'price']]

# Step 2: Examining 'average price' for each combination of drive-wheels and body-style
# (as_index=False keeps the group keys as ordinary columns)
df_group = df_test.groupby(['drive-wheels', 'body-style'], as_index=False).mean()
df_group

In [None]:
### How to pivot a table in Python: the values of one variable become the rows (index), the values of another become the columns
### ('variablename' is a placeholder; index and columns must name two different variables)

df_pivot = df_group.pivot(index = 'variablename', columns = 'variablename')

https://www.youtube.com/watch?v=q9ColmygT0s


# Heatmap

#### Plots target variable over multiple variables (and various categories within these variables) in a visual way

#### Heatmaps are most suitable for pivot tables in Python!

# NOTE(review): assumes `plt` is matplotlib.pyplot - confirm which import is in effect
plt.pcolor(df_pivot, cmap = 'RdBu')  # colour each cell of the pivot table by its value
plt.colorbar()  # legend mapping colours to values
plt.show()


# Correlation

## A statistical metric to measure to what extent two variables are INTERDEPENDENT. In other words, how does change in one impact the other.

### Example: Correlation between 'engine size' AND  'price'

In [None]:
### scatter plot with regression line

sns.regplot(x='engine-size', y='price', data=df)
plt.ylim(0,)  # start the y-axis at 0; the upper limit is left automatic

# Pearson Correlation

### A method to examine the correlation between continuous variables.
#### Pearson correlation gives us: the correlation coefficient AND the p-value

In [None]:
# Correlation

# Step 1: import the statistics subpackage of scipy

import scipy.stats as stats

# Step 2: pearsonr returns the correlation coefficient and the p-value
# (the column names below are placeholders)

pearson_coef, p_value = stats.pearsonr(df['predictor_variable'], df['target_variable'])



# ANOVA - Analysis of Variance

### Used for analyzing Categorical Variables. ANOVA can be used to find correlation between different groups of a categorical variable. 

#### For example, we can use ANOVA to see if there is any difference in the impact of different car makes on 'Price'. 

### ANOVA returns two values: 
#### F-test score: the variation between the means of the categories divided by the variation within each category.
#### p-value: the degree of statistical significance

#### Large F-test score and Small p-value MEANS : There is a strong correlation between a Categorical variable and a target variable.

https://www.youtube.com/watch?v=wMQ5oVuXK7o

In [None]:
# ANOVA using the scipy package

# For example: relationship between the various 'car make' categories and 'price'

# f_oneway lives in the scipy.stats subpackage, not in top-level scipy
from scipy import stats

df_anova = df[["make", "price"]]
# group by a scalar key so get_group() accepts the plain group name
grouped_anova = df_anova.groupby("make")


# compare the prices of two makes; the result holds the F statistic and the p-value
anova_results_1 = stats.f_oneway(grouped_anova.get_group("honda")["price"],grouped_anova.get_group("subaru")["price"])


# Fitting a Simple Linear Regression Model

In [1]:
# Step 1: import the estimator class

from sklearn.linear_model import LinearRegression

# Step 2: Create a Linear Regression Object using the constructor

lm = LinearRegression()


# Step 3: We define the predictor variable and the target variable

X = df[['highway-mpg']]  # double brackets select a one-column DataFrame (2-D)
Y = df['price']  # single brackets select a Series (1-D)
    
    

NameError: name 'df' is not defined