# **Pandas**
### Python's most widely used Data Analysis and Manipulation Tool
- Fast, Powerful, Flexible, Easy to Use and Open Source.
- Simplifies common data-handling tasks such as dealing with missing values, converting types, and organising data.
- Widely compatible with other libraries, making it an excellent choice for preparing data for ML algorithms.
<hr>

Importing Library

In [2]:
from pathlib import Path

import pandas as pd

Series

In [3]:
# Values and custom labels shared by the first two examples
ele = [6, 1, 9]
idx = ['a', 'v', 'i']

# Four ways to construct a Series
series_examples = [
    pd.Series(ele),                                           # From a list: default integer index
    pd.Series(ele, index=idx),                                # From a list with explicit labels
    pd.Series({'ur': 'YaMi', 'mom': 'GuSe', 'gey': 'ViPa'}),  # From a dict: keys become the index
    pd.Series(69, index=[1, 2, 3]),                           # From a scalar: repeated for every label
]

# Print every example, separated by one blank line
print(*series_examples, sep="\n\n")

0    6
1    1
2    9
dtype: int64

a    6
v    1
i    9
dtype: int64

ur     YaMi
mom    GuSe
gey    ViPa
dtype: object

1    69
2    69
3    69
dtype: int64


DataFrame

In [4]:
# Three ways to construct a DataFrame
frame_examples = (
    # From a dict of lists: each key becomes a column
    pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}),
    # From a dict of Series: each Series becomes a column
    pd.DataFrame({'g': pd.Series([7, 2, 1, 1, 4]), 's': pd.Series([1, 0, 1, 1, 4])}),
    # From a list of rows, with explicit column names and row labels
    pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=['A', 'B', 'C'], index=['a', 'b', 'c']),
)

# Print every example, separated by one blank line
print(*frame_examples, sep="\n\n")

   A  B  C
0  1  4  7
1  2  5  8
2  3  6  9

   g  s
0  7  1
1  2  0
2  1  1
3  1  1
4  4  4

   A  B  C
a  1  2  3
b  4  5  6
c  7  8  9


Reading from / Writing to an External File

In [5]:
dat = {'Name': ['GuSe', 'YaMi', 'ViPa'], 'Age': [19, 15, 18], 'City': ['DL', 'HY', 'UP']}
frame = pd.DataFrame(dat)

# to_csv() does not create missing folders, so make sure the target directory
# exists first; otherwise this cell fails wherever 'Data/' is not already present.
csv_path = Path('Data') / 'People.csv'
csv_path.parent.mkdir(parents=True, exist_ok=True)

frame.to_csv(csv_path, index=False)                                             # index=False: do not write the row labels

print(pd.read_csv(csv_path))                                                    # Whole file
print()
print(pd.read_csv(csv_path, nrows=2))                                           # Only the first 2 data rows
print()
print(pd.read_csv(csv_path, usecols=["Name", "City"]))                          # Only the listed columns
print()
print(pd.read_csv(csv_path, skiprows=[1, 3]))                                   # Skip file lines 1 and 3 (line 0 is the header)

   Name  Age City
0  GuSe   19   DL
1  YaMi   15   HY
2  ViPa   18   UP

   Name  Age City
0  GuSe   19   DL
1  YaMi   15   HY

   Name City
0  GuSe   DL
1  YaMi   HY
2  ViPa   UP

   Name  Age City
0  YaMi   15   HY


Reading from a URL

In [6]:
# read_table() splits on tabs unless told otherwise, so the comma delimiter is passed explicitly
data = pd.read_table(
    "http://bit.ly/imdbratings",
    sep=",",                                                                    # Default sep is "\t"
)
print(data.head())

# No header row in this file: header=None makes pandas number the columns instead
data = pd.read_table(
    "http://bit.ly/movieusers",
    sep="|",
    header=None,                                                                # No header
)
print(data.head())

   star_rating                     title content_rating   genre  duration  \
0          9.3  The Shawshank Redemption              R   Crime       142   
1          9.2             The Godfather              R   Crime       175   
2          9.1    The Godfather: Part II              R   Crime       200   
3          9.0           The Dark Knight          PG-13  Action       152   
4          8.9              Pulp Fiction              R   Crime       154   

                                         actors_list  
0  [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...  
1    [u'Marlon Brando', u'Al Pacino', u'James Caan']  
2  [u'Al Pacino', u'Robert De Niro', u'Robert Duv...  
3  [u'Christian Bale', u'Heath Ledger', u'Aaron E...  
4  [u'John Travolta', u'Uma Thurman', u'Samuel L....  
   0   1  2           3      4
0  1  24  M  technician  85711
1  2  53  F       other  94043
2  3  23  M      writer  32067
3  4  24  M  technician  43537
4  5  33  F       other  15213


Basic Properties

In [34]:
data = pd.read_csv("http://bit.ly/uforeports")
print(data.head(3), end="\n\n")                                                 # First 3 records; tail() gives the last 5 by default

print("Shape of the data :", data.shape)
print("Index values :", data.index)
print("List of column names :", data.columns)

# Pull the column out once and reuse it for the per-column summaries below
shape_col = data["Shape Reported"]
print("Number of unique values :", shape_col.nunique())                         # NaN is not counted here...
print(shape_col.unique())                                                       # ...but it does appear in this listing
print()

print(shape_col.value_counts().head(3), end="\n\n")                             # Most frequent values with their counts
print(shape_col.value_counts(normalize=True).head(3))                           # Same ranking as fractions (0-1) instead of counts


          City Colors Reported Shape Reported State             Time
0       Ithaca             NaN       TRIANGLE    NY   6/1/1930 22:00
1  Willingboro             NaN          OTHER    NJ  6/30/1930 20:00
2      Holyoke             NaN           OVAL    CO  2/15/1931 14:00

Shape of the data : (18241, 5)
Index values : RangeIndex(start=0, stop=18241, step=1)
List of column names : Index(['City', 'Colors Reported', 'Shape Reported', 'State', 'Time'], dtype='object')
Number of unique values : 27
['TRIANGLE' 'OTHER' 'OVAL' 'DISK' 'LIGHT' 'CIRCLE' 'CIGAR' 'CYLINDER'
 'FIREBALL' 'SPHERE' nan 'RECTANGLE' 'FORMATION' 'FLASH' 'CHEVRON' 'EGG'
 'CONE' 'DIAMOND' 'VARIOUS' 'TEARDROP' 'CROSS' 'DELTA' 'ROUND' 'DOME'
 'PYRAMID' 'CRESCENT' 'FLARE' 'HEXAGON']

Shape Reported
LIGHT       2803
DISK        2122
TRIANGLE    1889
Name: count, dtype: int64

Shape Reported
LIGHT       0.179714
DISK        0.136052
TRIANGLE    0.121113
Name: proportion, dtype: float64


Accessing Elements

In [8]:
s = pd.Series([1, 2, 3, 4], index=list('abcd'))
print(s.loc['c'])                                                                   # Lookup by label; same as s['c']

d = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=list('ABC'), index=list('abc'))
print(
    d['A'],                                                                         # One column, selected by name
    d.loc['a'],                                                                     # One row, selected by label; same as d.iloc[0]
    d.iloc[:, -1],                                                                  # Every row of the last column, selected by position
    sep="\n",
)

# Overwrite a single cell addressed by (row label, column name)
d.loc["b", "B"] = 0
print(d)

3
a    1
b    4
c    7
Name: A, dtype: int64
A    1
B    2
C    3
Name: a, dtype: int64
a    3
b    6
c    9
Name: C, dtype: int64
   A  B  C
a  1  2  3
b  4  0  6
c  7  8  9


Indexing

In [9]:
d = pd.DataFrame({'A': [1, 2, 3], 'C': [7, 8, 9]})
print(d, end="\n\n")

# Promote column 'A' to the row index
d = d.set_index('A')
print(d, end="\n\n")

# Clear the index name so 'A' is no longer printed above the row labels
d.index.name = None
print(d, end="\n\n")

# Move the index back into a regular column; being unnamed, it comes back as 'index'
d = d.reset_index()
print(d)

   A  C
0  1  7
1  2  8
2  3  9

   C
A   
1  7
2  8
3  9

   C
1  7
2  8
3  9

   index  C
0      1  7
1      2  8
2      3  9


Row / Column Manipulation

In [10]:
d = pd.DataFrame({'A': [1, 2, 3], 'C': [7, 8, 9]})

# replace() returns a modified copy each time; d itself is left untouched
print(d.replace(to_replace=1, value=0))                 # One value -> one replacement
print(d.replace(to_replace={1: 0, 7: 5}))               # Dict of old -> new pairs
print(d.replace(to_replace=[1, 2, 3], value=5))         # Several values -> the same replacement

# insert() modifies d in place: new column 'B' goes to position 1
d.insert(loc=1, column='B', value=[4, 5, 6])
print(d)

# pop() removes the column from d and hands it back as a Series
removed = d.pop("B")
print(removed)
print(d)

   A  C
0  0  7
1  2  8
2  3  9
   A  C
0  0  5
1  2  8
2  3  9
   A  C
0  5  7
1  5  8
2  5  9
   A  B  C
0  1  4  7
1  2  5  8
2  3  6  9
0    4
1    5
2    6
Name: B, dtype: int64
   A  C
0  1  7
1  2  8
2  3  9


Operations

In [11]:
a = pd.Series([1, 2, 3, 4, 5])
b = pd.Series([6, 9, 4, 2, 0])
c = pd.Series(25, index=[1, 2, 3, 4, 5])
print(
    a + b,                                                                        # Element-wise arithmetic on matching labels
    a + c,                                                                        # Labels present on only one side give NaN
    sep="\n",
)

e = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [4, 5, 6, 7], 'C': [7, 8, 9, 0]})
col_sum = e["A"] + e["B"]                                                         # Column-wise: one result per row
row_sum = e.iloc[0] + e.iloc[-1]                                                  # Row-wise: first row + last row, one result per column
print(col_sum, row_sum, sep="\n")

f = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
f["B < 5"] = f["B"].lt(5)                                                         # Conditional operation stored as a new boolean column
print(f)

0     7
1    11
2     7
3     6
4     5
dtype: int64
0     NaN
1    27.0
2    28.0
3    29.0
4    30.0
5     NaN
dtype: float64
0     5
1     7
2     9
3    11
dtype: int64
A     5
B    11
C     7
dtype: int64
   A  B  C  B < 5
0  1  4  7   True
1  2  5  8  False
2  3  6  9  False


Some Useful Functions

In [51]:
data = pd.read_csv("http://bit.ly/imdbratings")

# Remove a column by name
data = data.drop(columns="actors_list")
print(data.head(3), end="\n\n")

# Remove the row labelled 0
data = data.drop(index=0)
print(data.head(3), end="\n\n")

# Rename selected columns, then lower-case every column name
data = data.rename(columns={"star_rating": "rating", "content_rating": "content"})
data.columns = data.columns.str.lower()
print(data.head(3), end="\n\n")

# Summary statistics
print(data.describe())

   star_rating                     title content_rating  genre  duration
0          9.3  The Shawshank Redemption              R  Crime       142
1          9.2             The Godfather              R  Crime       175
2          9.1    The Godfather: Part II              R  Crime       200

   star_rating                   title content_rating   genre  duration
1          9.2           The Godfather              R   Crime       175
2          9.1  The Godfather: Part II              R   Crime       200
3          9.0         The Dark Knight          PG-13  Action       152

   rating                   title content   genre  duration
1     9.2           The Godfather       R   Crime       175
2     9.1  The Godfather: Part II       R   Crime       200
3     9.0         The Dark Knight   PG-13  Action       152

           rating    duration
count  978.000000  978.000000
mean     7.888344  120.958078
std      0.333198   26.222793
min      7.400000   64.000000
25%      7.600000  102.0000