# Pandas

Pandas is an open-source, high-performance, easy-to-use library that provides data structures and data analysis tools for the Python programming language. It is designed for working with table-like data, primarily through its two core objects, the Series and the DataFrame.

Key data manipulation tasks include reshaping, merging, sorting, slicing, aggregation, and imputation.

## 1.  Installation

In [None]:
# For mac 
pip install conda
conda install pandas

# For Window
pip install conda
pip install pandas


## 2. Pandas Data Structures

### 1. Creating Pandas Series

In [1]:
# Build a Series from a plain Python list.
# No index is given, so the values are labelled 0, 1, 2, ...

import pandas as pd
import numpy as np

nums = list(range(1, 6))
s = pd.Series(data=nums)
print(s)


0    1
1    2
2    3
3    4
4    5
dtype: int64


In [2]:
# Same values as before, but with explicit index labels instead of the default ones.

nums = [1, 2, 3, 4, 5]
s = pd.Series(
    data=nums,
    index=[1, 2, 3, 4, 5],  # one label per value
)
print(s)

1    1
2    2
3    3
4    4
5    5
dtype: int64


In [3]:
# A dictionary converts directly into a Series:
# the keys become the index labels and the values become the data.

dct = dict(name='Suniksha', country='Finland', city='Helsinki')
s = pd.Series(data=dct)
print(s)

name       Suniksha
country     Finland
city       Helsinki
dtype: object


In [4]:
# A scalar is repeated once for every index label, giving a constant Series.

s = pd.Series(data=10, index=[1, 2, 3])
print(s)

1    10
2    10
3    10
dtype: int64


In [5]:
# np.linspace produces 10 evenly spaced values from 5 to 20 (both ends included);
# wrapping the array in a Series gives it the default 0..9 index.

s = pd.Series(np.linspace(start=5, stop=20, num=10))
print(s)

0     5.000000
1     6.666667
2     8.333333
3    10.000000
4    11.666667
5    13.333333
6    15.000000
7    16.666667
8    18.333333
9    20.000000
dtype: float64


### 2. Creating Pandas DataFrame

In [6]:
# Each inner list is one row; the column labels are supplied separately.

data = [
    ['Avadhesh', 'Finland', 'Helsinki'],
    ['David', 'UK', 'London'],
    ['John', 'Sweden', 'Stockholm'],
]
df = pd.DataFrame(data=data, columns=['Names', 'Country', 'City'])
print(df)

      Names  Country       City
0  Avadhesh  Finland   Helsinki
1     David       UK     London
2      John   Sweden  Stockholm


In [8]:
# Each key is a column label and each list holds that column's values.

data = {
    'Name': ['Avadhesh', 'David', 'John'],
    'Country': ['Finland', 'UK', 'Sweden'],
    'City': ['Helsinki', 'London', 'Stockholm'],
}
df = pd.DataFrame(data=data)
print(df)

       Name  Country       City
0  Avadhesh  Finland   Helsinki
1     David       UK     London
2      John   Sweden  Stockholm


In [10]:
# Each dictionary is one row; its keys name the columns.

data = [
    dict(Name='Avadhesh', Country='Finland', City='Helsinki'),
    dict(Name='David', Country='UK', City='London'),
    dict(Name='John', Country='Sweden', City='Stockholm'),
]
df = pd.DataFrame(data=data)
print(df)

       Name  Country       City
0  Avadhesh  Finland   Helsinki
1     David       UK     London
2      John   Sweden  Stockholm


## 3. Reading CSV file

In [None]:
# Load the CSV file (path is relative to the notebook's working directory)
# into a DataFrame and show it.
df = pd.read_csv(filepath_or_buffer='weight-height.csv')
print(df)


### 3.1  Data Exploration

In [None]:

print(df.head())  # First 5 rows

print(df.tail())  # Last 5 rows

print(df.shape)  # Shape of the DataFrame as (rows, columns)

print(df.columns)  # Column names

heights = df['Height']  # Accessing a specific column
print(heights)

# `weights` has to be bound before it is summarised below; without this line
# the cell raises NameError on a fresh kernel.
# NOTE(review): assumes the CSV has a 'Weight' column next to 'Height' - confirm.
weights = df['Weight']

print(heights.describe())  # Statistical summary of one column
print(weights.describe())
print(df.describe())  # Statistical summary of every numeric column

# Data information. info() writes its report itself and returns None, so it is
# called directly rather than wrapped in print() (which would add a stray "None").
df.info()


 

### 3.2 Modifying a DataFrame

In [12]:
# Adding Columns

# Start again from the small three-person table. Section 3 rebinds `df` to the
# contents of 'weight-height.csv', so on a top-to-bottom run `df` would otherwise
# be the CSV table rather than the three-person table this section modifies.
df = pd.DataFrame({
    'Name': ['Avadhesh', 'David', 'John'],
    'Country': ['Finland', 'UK', 'Sweden'],
    'City': ['Helsinki', 'London', 'Stockholm'],
})

# Assigning a list to a new column label appends that column;
# the list needs one value per row.
weights = [74, 78, 69]
df['Weight'] = weights
heights = [173, 175, 169]
df['Height'] = heights
print(df)

       Name  Country       City  Weight  Height
0  Avadhesh  Finland   Helsinki      74     173
1     David       UK     London      78     175
2      John   Sweden  Stockholm      69     169


In [13]:
# Overwrite an existing column with a scaled copy of itself:
# every Height value is multiplied by 0.01.
# Running this cell more than once scales the column again each time.

df['Height'] = df['Height'].mul(0.01)
print(df)

       Name  Country       City  Weight  Height
0  Avadhesh  Finland   Helsinki      74    1.73
1     David       UK     London      78    1.75
2      John   Sweden  Stockholm      69    1.69


In [14]:
# Calculating and Adding BMI Column

def calculate_bmi(frame=None):
    """Return weight / height**2 for every row, as a list of floats.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Table with 'Weight' and 'Height' columns. When omitted, the notebook's
        global ``df`` is used, which is what calling ``calculate_bmi()`` with
        no arguments has always done.

    Returns
    -------
    list
        One value per row, in row order. The numbers are a conventional BMI
        only if weight is in kilograms and height in metres.
    """
    if frame is None:
        frame = df  # backward-compatible default: the notebook's working frame
    heights = frame['Height']
    # Column arithmetic is element-wise, so no explicit Python loop is needed.
    return (frame['Weight'] / (heights * heights)).tolist()

# Store the computed values as a new column, then keep one decimal place.
bmi = calculate_bmi()
df['BMI'] = bmi
df['BMI'] = df['BMI'].round(1)
print(df)

       Name  Country       City  Weight  Height   BMI
0  Avadhesh  Finland   Helsinki      74    1.73  24.7
1     David       UK     London      78    1.75  25.5
2      John   Sweden  Stockholm      69    1.69  24.2


In [15]:
# Adding Birth Year and Current Year Columns

# Birth years are given as strings here, so the new column holds text.
birth_year = ['1769', '1985', '1990']
df['Birth Year'] = birth_year

# A constant Series whose index labels (0, 1, 2) line up with the rows of df.
current_year = pd.Series(data=2020, index=[0, 1, 2])
df['Current Year'] = current_year

print(df)

       Name  Country       City  Weight  Height   BMI Birth Year  Current Year
0  Avadhesh  Finland   Helsinki      74    1.73  24.7       1769          2020
1     David       UK     London      78    1.75  25.5       1985          2020
2      John   Sweden  Stockholm      69    1.69  24.2       1990          2020


In [16]:
# Checking Data Types

print(df['Weight'].dtype)

# Convert both year columns to integers and report the resulting dtype of each.
for column in ('Birth Year', 'Current Year'):
    df[column] = df[column].astype('int')
    print(df[column].dtype)

int64
int32
int32


In [17]:
# Calculating Age

# Element-wise subtraction of two columns: current year minus birth year.
ages = df['Current Year'].sub(df['Birth Year'])
df['Ages'] = ages
print(df)

       Name  Country       City  Weight  Height   BMI  Birth Year  \
0  Avadhesh  Finland   Helsinki      74    1.73  24.7        1769   
1     David       UK     London      78    1.75  25.5        1985   
2      John   Sweden  Stockholm      69    1.69  24.2        1990   

   Current Year  Ages  
0          2020   251  
1          2020    35  
2          2020    30  


In [18]:
# Handling Outliers

# Ages above this threshold are treated as outliers.
MAX_PLAUSIBLE_AGE = 120

# Boolean mask: True for rows whose age is above the threshold.
is_outlier = df['Ages'] > MAX_PLAUSIBLE_AGE

# Compute the mean from the non-outlier rows instead of typing the ages in by
# hand, so the figure stays correct if the data changes.
mean = df.loc[~is_outlier, 'Ages'].mean()
print('Mean:', mean)

print(df[is_outlier])   # rows flagged as outliers
print(df[~is_outlier])  # remaining rows (an age of exactly 120 is kept here)

Mean: 32.5
       Name  Country      City  Weight  Height   BMI  Birth Year  \
0  Avadhesh  Finland  Helsinki      74    1.73  24.7        1769   

   Current Year  Ages  
0          2020   251  
    Name Country       City  Weight  Height   BMI  Birth Year  Current Year  \
1  David      UK     London      78    1.75  25.5        1985          2020   
2   John  Sweden  Stockholm      69    1.69  24.2        1990          2020   

   Ages  
1    35  
2    30  


## Congratulations, We Are Done!