# Putting Some Pandas In Your Python

Command: <code>pip install pandas</code>

![title](image/1.jpg)

In [1]:
! pip install pandas



In [1]:
import pandas as pd
import numpy as np

## Pandas Series Data Structure

In [2]:
# pd.Series(data, index)
# index -> labels must be hashable and the same length as data (duplicate labels are allowed).
# When no index is given, a default integer index 0..n-1 is used (see the output below).

s = pd.Series([1, 2, 3, 4])

print(s)

0    1
1    2
2    3
3    4
dtype: int64


In [3]:
print(s[2])

3


In [4]:
print(s[1:3])

1    2
2    3
dtype: int64


In [5]:
s = pd.Series(['x', 'y', 'z', 'abc'])

print(s)

0      x
1      y
2      z
3    abc
dtype: object


In [6]:
s = pd.Series(['kanav', 'bansal'])

print(s)

0     kanav
1    bansal
dtype: object


### Creating Series from Numpy ndarray

In [5]:
data = np.array([10, 20, 30, 40, 50])

s = pd.Series(data)

print(s)

0    10
1    20
2    30
3    40
4    50
dtype: int32


In [7]:
# A Series is one-dimensional, so building one from a 2-D array fails.
# The error is the point of this cell: catch and print it so the notebook
# still runs top to bottom instead of stopping here.
data = np.array([[1, 2, 3], [4, 5, 6]])

try:
    s = pd.Series(data)
except Exception as exc:
    # The concrete exception class is printed rather than assumed, since the
    # recorded output shows a bare Exception while other pandas versions may
    # raise a more specific subclass.
    print(type(exc).__name__ + ':', exc)
else:
    print(s)

Exception: Data must be 1-dimensional

### Data accessing using Index

In [14]:
s = pd.Series([1, 2, 3, 4, 5])

print(s[2])

print(s[1:])

print(s[[1, 4]])

3
1    2
2    3
3    4
4    5
dtype: int64
1    2
4    5
dtype: int64


In [15]:
print(s[1:4])

1    2
2    3
3    4
dtype: int64


In [16]:
s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])

print(s)

a    1
b    2
c    3
d    4
e    5
dtype: int64


In [17]:
print(s['a'])

1


In [18]:
print(s['a':])

a    1
b    2
c    3
d    4
e    5
dtype: int64


In [19]:
# Retrieve multiple elements

print(s[['a', 'b', 'e']])

a    1
b    2
e    5
dtype: int64


In [13]:
# Looking up a label that is not in the index raises KeyError.
# Catch and print it so the demonstration does not abort a full re-run.
try:
    print(s['f'])
except KeyError as exc:
    print('KeyError:', exc)

KeyError: 'f'

## Pandas DataFrame

### Creating DataFrame using Dictionary

In [16]:
data = {'Name':['Tom', 'Jack', 'Steve', 'Ricky'], 'Age':[28,34,29,42]}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age
0,Tom,28
1,Jack,34
2,Steve,29
3,Ricky,42


In [20]:
# Creating indexed dataframe

data = {'Name':['Tom', 'Jack', 'Steve', 'Ricky'], 
        'Age':[28,34,29,42],
       'Gender':['Male', 'Female', 'Female', 'Male']}
df = pd.DataFrame(data, index=['I-1', 'I-2', 'I-3', 'I-4'])
print(df)

      Name  Age  Gender
I-1    Tom   28    Male
I-2   Jack   34  Female
I-3  Steve   29  Female
I-4  Ricky   42    Male


In [21]:
df = pd.DataFrame({
    'col-1': ['Item-1', 'Item-2', 'Item-3', 'Item-4'],
    'col-2': ['Gold', 'Bronze', 'Gold', 'Silver'],
    'col-3': [1, 2, np.nan, 4]
    })

print(df)

    col-1   col-2  col-3
0  Item-1    Gold    1.0
1  Item-2  Bronze    2.0
2  Item-3    Gold    NaN
3  Item-4  Silver    4.0


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   col-1   4 non-null      object 
 1   col-2   4 non-null      object 
 2   col-3   3 non-null      float64
dtypes: float64(1), object(2)
memory usage: 224.0+ bytes


### Creating DataFrame using a List of Tuples

In [8]:
data = [('1/1/2019', 13, 6, 'Rain'),
       ('2/1/2019', 11, 7, 'Fog'),
       ('3/1/2019', 12, 8, 'Sunny'),
       ('4/1/2019', 8, 5, 'Snow'),
       ('5/1/2019', 9, 6, 'Rain')]
df = pd.DataFrame(data,
                  columns=['Day', 'Temperature', 'WindSpeed', 'Event'])

df

Unnamed: 0,Day,Temperature,WindSpeed,Event
0,1/1/2019,13,6,Rain
1,2/1/2019,11,7,Fog
2,3/1/2019,12,8,Sunny
3,4/1/2019,8,5,Snow
4,5/1/2019,9,6,Rain


## DataFrame Basic Functionality

In [24]:
# Create a dictionary of Series (one Series per column).
# The mapping is called `students` instead of `dict` so the built-in
# dict type is not shadowed for the rest of the kernel session.
students = {'Name': pd.Series(['Tom', 'Jack', 'Steve', 'Ricky', 'Vin', 'James', 'Vin']),
            'Age': pd.Series([25, 26, 25, 35, 23, 33, 31]),
            'Rating': pd.Series([4.23, 4.1, 3.4, 5, 2.9, 4.7, 3.1])}

df = pd.DataFrame(students)
df

Unnamed: 0,Name,Age,Rating
0,Tom,25,4.23
1,Jack,26,4.1
2,Steve,25,3.4
3,Ricky,35,5.0
4,Vin,23,2.9
5,James,33,4.7
6,Vin,31,3.1


In [25]:
# columns -> Index holding the column labels of the DataFrame

df.columns

Index(['Name', 'Age', 'Rating'], dtype='object')

In [26]:
# Transpose-> returns transpose of DataFrame
df.T

Unnamed: 0,0,1,2,3,4,5,6
Name,Tom,Jack,Steve,Ricky,Vin,James,Vin
Age,25,26,25,35,23,33,31
Rating,4.23,4.1,3.4,5,2.9,4.7,3.1


In [27]:
df

Unnamed: 0,Name,Age,Rating
0,Tom,25,4.23
1,Jack,26,4.1
2,Steve,25,3.4
3,Ricky,35,5.0
4,Vin,23,2.9
5,James,33,4.7
6,Vin,31,3.1


In [28]:
# dtypes-> return datatype of each column

df.dtypes

Name       object
Age         int64
Rating    float64
dtype: object

In [29]:
# shape-> returns a tuple representing the dimensionality: (rows, columns)

df.shape

(7, 3)

In [30]:
# Axes-> returns list of row axis labels and column axis labels

df.axes

[RangeIndex(start=0, stop=7, step=1),
 Index(['Name', 'Age', 'Rating'], dtype='object')]

In [31]:
# info() -> prints a concise summary: index range, column names,
# non-null counts, dtype of each column and memory usage

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    7 non-null      object 
 1   Age     7 non-null      int64  
 2   Rating  7 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 296.0+ bytes


In [32]:
# values-> returns actual data as ndarray

df.values

array([['Tom', 25, 4.23],
       ['Jack', 26, 4.1],
       ['Steve', 25, 3.4],
       ['Ricky', 35, 5.0],
       ['Vin', 23, 2.9],
       ['James', 33, 4.7],
       ['Vin', 31, 3.1]], dtype=object)

In [33]:
# head-> by default head returns first 5 rows

df.head()

Unnamed: 0,Name,Age,Rating
0,Tom,25,4.23
1,Jack,26,4.1
2,Steve,25,3.4
3,Ricky,35,5.0
4,Vin,23,2.9


In [34]:
df.head(2)

Unnamed: 0,Name,Age,Rating
0,Tom,25,4.23
1,Jack,26,4.1


In [35]:
# tail-> by default tail returns last 5 rows

df.tail()

Unnamed: 0,Name,Age,Rating
2,Steve,25,3.4
3,Ricky,35,5.0
4,Vin,23,2.9
5,James,33,4.7
6,Vin,31,3.1


In [36]:
df.tail(2)

Unnamed: 0,Name,Age,Rating
5,James,33,4.7
6,Vin,31,3.1


## Statistics

In [37]:
df

Unnamed: 0,Name,Age,Rating
0,Tom,25,4.23
1,Jack,26,4.1
2,Steve,25,3.4
3,Ricky,35,5.0
4,Vin,23,2.9
5,James,33,4.7
6,Vin,31,3.1


In [38]:
# sum()-> returns the sum of values for requested axis. by default axis = 0

df.sum()

Name      TomJackSteveRickyVinJamesVin
Age                                198
Rating                           27.43
dtype: object

In [39]:
# axis=1 -> row-wise sum.
# numeric_only=True restricts the sum to the numeric columns; without it the
# text column 'Name' would be added to numbers, which newer pandas versions
# reject with a TypeError instead of silently skipping the column.

print(df.sum(axis=1, numeric_only=True))

0    29.23
1    30.10
2    28.40
3    40.00
4    25.90
5    37.70
6    34.10
dtype: float64


In [40]:
# mean() -> column-wise average.
# numeric_only=True skips the text column 'Name'; newer pandas versions
# raise a TypeError rather than dropping non-numeric columns automatically.

print(df.mean(numeric_only=True))

Age       28.285714
Rating     3.918571
dtype: float64


In [41]:
# std() -> column-wise standard deviation.
# numeric_only=True skips the text column 'Name'; newer pandas versions
# raise a TypeError rather than dropping non-numeric columns automatically.

print(df.std(numeric_only=True))

Age       4.644505
Rating    0.804828
dtype: float64


In [42]:
# describe() -> summarizing the data

print(df.describe())

             Age    Rating
count   7.000000  7.000000
mean   28.285714  3.918571
std     4.644505  0.804828
min    23.000000  2.900000
25%    25.000000  3.250000
50%    26.000000  4.100000
75%    32.000000  4.465000
max    35.000000  5.000000


In [43]:
# include object, number, all

print(df.describe(include=['object']))

       Name
count     7
unique    6
top     Vin
freq      2


In [33]:
print(df.describe(include=['number']))

             Age    Rating
count   7.000000  7.000000
mean   28.285714  3.918571
std     4.644505  0.804828
min    23.000000  2.900000
25%    25.000000  3.250000
50%    26.000000  4.100000
75%    32.000000  4.465000
max    35.000000  5.000000


In [44]:
# Don't pass 'all' as a list

print(df.describe(include='all'))

       Name        Age    Rating
count     7   7.000000  7.000000
unique    6        NaN       NaN
top     Vin        NaN       NaN
freq      2        NaN       NaN
mean    NaN  28.285714  3.918571
std     NaN   4.644505  0.804828
min     NaN  23.000000  2.900000
25%     NaN  25.000000  3.250000
50%     NaN  26.000000  4.100000
75%     NaN  32.000000  4.465000
max     NaN  35.000000  5.000000


## Working with .csv

### Iris Dataset

In [24]:
df = pd.read_csv('data/Iris.csv')

df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
df.tail()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica
149,150,5.9,3.0,5.1,1.8,Iris-virginica


In [6]:
print(df.shape)

(150, 6)


In [8]:
print(df.columns)

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')


In [10]:
# 'Species' holds text, so limit the mean to the numeric columns explicitly
# (newer pandas versions no longer drop non-numeric columns silently).
print(df.mean(numeric_only=True))

Id               75.500000
SepalLengthCm     5.843333
SepalWidthCm      3.054000
PetalLengthCm     3.758667
PetalWidthCm      1.198667
dtype: float64


In [11]:
# 'Species' holds text, so limit the standard deviation to the numeric
# columns explicitly (newer pandas versions no longer drop non-numeric
# columns silently).
print(df.std(numeric_only=True))

Id               43.445368
SepalLengthCm     0.828066
SepalWidthCm      0.433594
PetalLengthCm     1.764420
PetalWidthCm      0.763161
dtype: float64


In [9]:
df.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [26]:
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [27]:
df.describe(include=['object'])

Unnamed: 0,Species
count,150
unique,3
top,Iris-setosa
freq,50


### Weather Dataset

In [7]:
import pandas as pd

df = pd.read_csv('data/nyc_weather.csv')

df.head()

Unnamed: 0,EST,Temperature,DewPoint,Humidity,Sea Level PressureIn,VisibilityMiles,WindSpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
0,1/1/2016,38,23,52,30.03,10,8.0,0,5,,281
1,1/2/2016,36,18,46,30.02,10,7.0,0,3,,275
2,1/3/2016,40,21,47,29.86,10,8.0,0,1,,277
3,1/4/2016,25,9,44,30.05,10,9.0,0,3,,345
4,1/5/2016,20,-3,41,30.57,10,5.0,0,0,,333


In [9]:
# What is the maximum temperature?

df['Temperature'].max()

50

In [12]:
# Select the row with maximum Temperature

df[df.Temperature == df.Temperature.max()]

Unnamed: 0,EST,Temperature,DewPoint,Humidity,Sea Level PressureIn,VisibilityMiles,WindSpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
9,1/10/2016,50,46,71,29.59,4,,1.8,7,Rain,109


In [10]:
# On which days did it rain?
# A single .loc call selects rows (boolean mask) and the column in one step,
# avoiding chained indexing.

df.loc[df['Events'] == 'Rain', 'EST']

8      1/9/2016
9     1/10/2016
15    1/16/2016
26    1/27/2016
Name: EST, dtype: object

In [13]:
# Select the day with the maximum Temperature.
# A single .loc call selects rows (boolean mask) and the column in one step,
# avoiding chained indexing.

df.loc[df['Temperature'] == df['Temperature'].max(), 'EST']

9    1/10/2016
Name: EST, dtype: object

In [11]:
# Average wind speed

df['WindSpeedMPH'].mean()

6.892857142857143

## Working with .xlsx

Command_1: <code>pip3 install xlrd</code> (reads legacy <code>.xls</code> files)<br /><br />
Command_2: <code>pip3 install openpyxl</code> (reads and writes <code>.xlsx</code> files)

In [1]:
import pandas as pd

df = pd.read_excel('data/weather_data.xlsx')

df.head()

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain


## Dataframe to .csv & .xlsx

In [29]:
import pandas as pd
import numpy as np

# Create a dictionary of Series (one Series per column).
# The mapping is called `students` instead of `dict` so the built-in
# dict type is not shadowed for the rest of the kernel session.
students = {'Name': pd.Series(['Tom', 'Jack', 'Steve', 'Ricky', 'Vin', 'James', 'Smith']),
            'Age': pd.Series([25, 26, 25, 35, 23, 33, 31]),
            'Rating': pd.Series([4.23, 4.1, 3.4, 5, 2.9, 4.7, 3.1])}

df = pd.DataFrame(students)
print(df)

    Name  Age  Rating
0    Tom   25    4.23
1   Jack   26    4.10
2  Steve   25    3.40
3  Ricky   35    5.00
4    Vin   23    2.90
5  James   33    4.70
6  Smith   31    3.10


In [30]:
# Write Dataframe to CSV

import os

# to_csv() does not create missing parent directories, so make sure the
# output folder exists before the first file is written into it.
os.makedirs('data/temp', exist_ok=True)

df.to_csv('data/temp/new_csv_file.csv')

In [31]:
# Write Dataframe to CSV without index

df.to_csv('data/temp/new_csv_file_1.csv', index=False)

In [32]:
# Write Dataframe to XLSX

df.to_excel('data/temp/new_excel_file.xlsx', sheet_name='stud_data')

In [33]:
# Write Dataframe to XLSX without index

df.to_excel('data/temp/new_excel_file_noIndex.xlsx', sheet_name='stud_data', index=False)