# Installation 

    pip install pandas


# Data Structures supported by pandas are:
* Series        ==> 1D
* Data Frame    ==> 2D
* Panel         ==> 3D (deprecated in pandas 0.20, removed in 0.25)

# Series

In [None]:
# Can be created using following syntax

# pandas.Series( data, index, dtype, copy)

# data ==> ndarray, list, constants
# index ==> hashable labels (duplicates are allowed), same length as data. default is 0, 1, ..., n-1 like np.arange(n)
# dtype ==> data type
# copy ==> copy data

# A series can be created using various inputs like −

# Array
# Dict
# Scalar value or constant

In [None]:
# A Series with no data; the dtype is spelled out because there are no
# values to infer it from.
import pandas as pd

a = pd.Series(data=None, dtype='float64')
print(a)

In [None]:
# Series built from a NumPy ndarray
import pandas as pd
import numpy as np

# Multiples of 3 from 0 up to (but not including) 30.
data = 3 * np.arange(10)

s = pd.Series(data=data)
print(s)

In [None]:
# Series built from an array of one-character strings
import pandas as pd
import numpy as np

letters = ['a', 'b', 'c', 'd', 'e', 'f']
data = np.array(letters)
s = pd.Series(data=data)
print(s)

In [None]:
# Same Series as before, but shown through the notebook's rich display
# (a bare expression on the last line) instead of print().
import pandas as pd
import numpy as np

data = np.array(list('abcdef'))
s = pd.Series(data=data)
s

In [None]:
# Series from an ndarray with explicit index labels
import pandas as pd
import numpy as np

data = np.array(['a', 'b', 'c', 'd'])
labels = [11, 12, 13, 14]  # one label per element of data
s = pd.Series(data=data, index=labels)
print(s)

In [None]:
# Series from a dict: the keys become the index labels

import pandas as pd
import numpy as np

data = dict(fn='John', ln='Rambo', mn='9886098860', country="India")
s = pd.Series(data=data)
print(s)

In [None]:
# Series from a dict whose values are integers

import pandas as pd
import numpy as np

data = dict(zip('abc', range(3)))  # {'a': 0, 'b': 1, 'c': 2}
s = pd.Series(data=data)
print(s)

In [None]:
# Series from a dict plus an explicit index: values are picked by label in
# the requested order, and a label missing from the dict ('z') becomes NaN.

import pandas as pd
import numpy as np

data = {'a': 0, 'b': 1, 'c': 2}
wanted_labels = ['c', 'a', 'z', 'b']
s = pd.Series(data=data, index=wanted_labels)
print(s)

In [None]:
# Series from a scalar: the value is repeated once per index label

import pandas as pd
import numpy as np

labels = [0, 1, 2, 3]
s = pd.Series(data=5, index=labels)
print(s)

In [None]:
# Accessing data from a Series by position
#
# With the default integer index, plain s[3] is a *label* lookup that only
# happens to coincide with the position. .iloc is always positional, so it
# is the accessor that matches this cell's purpose.

import pandas as pd
s = pd.Series([1,2,3,4,5])

print(s)
print(s.iloc[3])  # fourth element (positions start at 0)


In [None]:
import pandas as pd

# Give every value a string label, then read one value back by its label.
labels = ['a', 'b', 'c', 'd', 'e']
s = pd.Series([1, 2, 3, 4, 5], index=labels)

print(s)
print(s['c'])


In [None]:
import pandas as pd

labels = ['a', 'b', 'c', 'd', 'e']
s = pd.Series([1, 2, 3, 4, 5], index=labels)

# An integer slice selects by position: here, the first 3 elements.
leading = s[:3]
print(leading)

# A negative start counts from the end: the last 3 elements.
trailing = s[-3:]
print(trailing)

In [None]:
# Look up a single value by its index label
import pandas as pd

values = [1, 2, 3, 4, 5]
s = pd.Series(values, index=list('abcde'))

print(s)
print(s['d'])

# DataFrame

In [None]:
# DataFrame can be created using following syntax

# pandas.DataFrame( data, index, columns, dtype, copy)

# data can be,
    # Lists
    # dict
    # Series
    # Numpy ndarrays
    # Another DataFrame

In [None]:
# A DataFrame with no rows and no columns

import pandas as pd

df = pd.DataFrame(data=None)
print(df)

In [None]:
# DataFrame from a flat list: one column (labelled 0), one row per item

import pandas as pd

data = list(range(1, 6))  # [1, 2, 3, 4, 5]
df = pd.DataFrame(data=data)
print(df)

In [None]:
import pandas as pd

# Each inner list is one row; the column names are supplied separately.
column_names = ['Name', 'Age']
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
]
df = pd.DataFrame(data, columns=column_names)
print(df)

In [None]:
import pandas as pd

# A small table of user records: one inner list per user, with the fields
# in the same order as the column names below.
column_names = ['uid', 'first-name', 'last-name', 'mobile-number', 'email-id']
data = [
    ['u01', 'John', 'Rambo', '9886098860', 'john@gmail.com'],
    ['u02', 'Suresh', 'Vemparala', '9845098450', 'suresh@yahoomail.com'],
    ['u03', 'Michelle', 'Obama', '11223344', 'michelle@gmail.com'],
    ['u04', 'Nithin', 'Gadkari', '8899776655', 'nithin@hotmail.com'],
    ['u05', 'Rahul', 'Gandhi', '******', 'rahul@congress.com'],
    ['u06', 'Rajani', 'Kanth', '998877', 'gmail@rajani.com'],
]
df = pd.DataFrame(data, columns=column_names)
print(df)



In [None]:
import pandas as pd
import numpy as np

# The first age is written as a float literal (10.), so the Age column
# holds a mix of float and int values when the frame is built.
column_names = ['Name', 'Age']
data = [['Alex', 10.], ['Bob', 12], ['Clarke', 13]]
df = pd.DataFrame(data, columns=column_names)

print(df)

# Show the dtype pandas chose for each column.
column_types = df.dtypes
print(column_types)


In [None]:
# DataFrame from a dict of equal-length lists: each key becomes a column
import pandas as pd

data = {
    'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
    'Age': [28, 34, 29, 42],
}
df = pd.DataFrame(data=data)
print(df)

In [None]:
# DataFrame from a dict of lists, with custom row labels

import pandas as pd

data = {
    'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
    'Age': [28, 34, 29, 42],
}
row_labels = ['rank1', 'rank2', 'rank3', 'rank4']
df = pd.DataFrame(data, index=row_labels)
print(df)

In [None]:
# Create a DataFrame by passing a list of dictionaries.
# Each dict is one row and the keys become the columns; a key missing from
# a row (here 'c' in the first dict) is filled with NaN.

import pandas as pd
data = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}]
df = pd.DataFrame(data)
print(df)

In [None]:
# DataFrame from a list of dictionaries, with explicit row labels and an
# explicit choice of columns.

import pandas as pd

data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
row_labels = ['first', 'second']

# Both requested columns exist as dictionary keys.
df1 = pd.DataFrame(data, index=row_labels, columns=['a', 'b'])

# 'value' is not a key in any dictionary, so that column is all NaN.
df2 = pd.DataFrame(data, index=row_labels, columns=['a', 'value'])

print(df1)
print(df2)

In [None]:
# DataFrame from a dict of Series: rows are aligned on the union of the
# Series' index labels, so 'one' has no value for row 'd'.

import pandas as pd
import numpy as np

one = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
two = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
d = {'one': one, 'two': two}

df1 = pd.DataFrame(d)
# Same data, but with every column requested as float64.
df2 = pd.DataFrame(d, dtype=np.float64)

print(df1)
print(df2)


In [21]:
# Selecting a column: indexing a DataFrame with a column name returns that
# column as a Series.

import pandas as pd

one = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
two = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
d = {'one': one, 'two': two}

df = pd.DataFrame(d)
selected = df['one']
print(selected)

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64


In [25]:
# Adding new columns to a DataFrame

import pandas as pd

one = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
two = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
d = {'one': one, 'two': two}

df = pd.DataFrame(d)
print(df)

# Assigning a Series aligns it on the row labels; row 'd' has no matching
# label in the new Series and therefore gets NaN.
print("Adding a new column by passing as Series:")
extra = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
df['three'] = extra
print(df)

# A column can also be computed from existing columns, element by element.
print("Adding a new column using the existing columns in DataFrame:")
total = df['one'] + df['three']
df['four'] = total
print(df)

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4
Adding a new column by passing as Series:
   one  two  three
a  1.0    1   10.0
b  2.0    2   20.0
c  3.0    3   30.0
d  NaN    4    NaN
Adding a new column using the existing columns in DataFrame:
   one  two  three  four
a  1.0    1   10.0  11.0
b  2.0    2   20.0  22.0
c  3.0    3   30.0  33.0
d  NaN    4    NaN   NaN


In [28]:
# Deleting a column
# Two ways are shown:
#     del  - removes the column in place
#     pop  - removes the column in place and returns it

import pandas as pd

one = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
two = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
three = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
d = {'one': one, 'two': two, 'three': three}

df = pd.DataFrame(d)
print("Our dataframe is:")
print(df)

# Using the del statement
print("Deleting the column using DEL function:")
del df['one']
print(df)

# Using the pop method (the returned column is not needed here)
print("Deleting column using POP function:")
df.pop('two')
print(df)


Our dataframe is:
   one  two  three
a  1.0    1   10.0
b  2.0    2   20.0
c  3.0    3   30.0
d  NaN    4    NaN
Deleting the column using DEL function:
   two  three
a    1   10.0
b    2   20.0
c    3   30.0
d    4    NaN
Deleting column using POP function:
   three
a   10.0
b   20.0
c   30.0
d    NaN


# Row Operations

In [31]:
# Row selection by label: .loc[label] returns that row as a Series

import pandas as pd

one = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
two = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
d = {'one': one, 'two': two}

df = pd.DataFrame(d)
print(df)

print('selecting by loc')
row_b = df.loc['b']
print(row_b)

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4
selecting by loc
one    2.0
two    2.0
Name: b, dtype: float64


In [33]:
# Row selection by integer position: .iloc[2] is the third row
import pandas as pd

one = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
two = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
d = {'one': one, 'two': two}

df = pd.DataFrame(d)
print(df)

print('selection by iloc')
third_row = df.iloc[2]
print(third_row)

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4
selection by iloc
one    3.0
two    3.0
Name: c, dtype: float64


In [37]:
# Slicing rows: an integer slice on a DataFrame selects rows by position,
# and an end past the last row is simply cut off at the last row.
import pandas as pd

one = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
two = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
d = {'one': one, 'two': two}

df = pd.DataFrame(d)
print(df)

print('slicing operation')
tail_rows = df[2:10]
print(tail_rows)


   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4
slicing operation
   one  two
c  3.0    3
d  NaN    4


In [39]:
# Add new rows by concatenating two DataFrames with the same columns.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
import pandas as pd

df = pd.DataFrame([[1, 2], [3, 4]], columns = ['a','b'])
df2 = pd.DataFrame([[5, 6], [7, 8]], columns = ['a','b'])

print("Before adding")
print(df)


print("After adding")
# Each frame keeps its own row labels, so 0 and 1 both appear twice.
df = pd.concat([df, df2])
print(df)

Before adding
   a  b
0  1  2
1  3  4
After adding
   a  b
0  1  2
1  3  4
0  5  6
1  7  8


In [40]:
# Deleting rows by label.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
import pandas as pd

df = pd.DataFrame([[1, 2], [3, 4]], columns = ['a','b'])
df2 = pd.DataFrame([[5, 6], [7, 8]], columns = ['a','b'])

# Each frame keeps its own row labels, so label 0 appears twice.
df = pd.concat([df, df2])
print(df)

# Drop rows with label 0 -- this removes every row carrying that label.
df = df.drop(0)

print("after deleting")
print (df)

   a  b
0  1  2
1  3  4
0  5  6
1  7  8
after deleting
   a  b
1  3  4
1  7  8


# Panels

In [None]:
# Panel is a 3D container of data
# Panel data is partially responsible for pandas ==> pan(el) - da(ta) - s
# Panel axis
#     items
#     major_axis
#     minor_axis

# syntax:
#     pandas.Panel(data, items, major_axis, minor_axis, dtype, copy)

# data
    # Data takes various forms like ndarray, series, map, lists, dict, constants and also another DataFrame

# items
    # axis=0

# major_axis
    # axis=1

# minor_axis
    # axis=2
    
# dtype	
    # Data type of each column
    
# copy
    # Copy data. Default, false
    

# Panel was deprecated in pandas 0.20 and removed in pandas 0.25.0
# if required, please use xarray package - https://xarray.pydata.org/en/stable/
    
    

In [46]:
# Cleaning: replace missing values in a chosen column

import pandas as pd

one = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
two = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
d = {'one': one, 'two': two}

df = pd.DataFrame(d)
print(df)


# Optional checks to run before cleaning:
# print('data information')
# print(df.info())

# print('sum of null values')
# print(df.isnull().sum())


# The dict maps column name -> replacement value; inplace=True modifies df
# itself instead of returning a new frame.
replacements = {'one': 555}
df.fillna(value=replacements, inplace=True)
print('after data cleasing')
print(df)


   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4
after data cleasing
     one  two
a    1.0    1
b    2.0    2
c    3.0    3
d  555.0    4


In [47]:
# Reading csv files
import pandas as pd

# read_csv already returns a DataFrame, so it does not need to be wrapped
# in pd.DataFrame(...). 'demo.csv' is looked up relative to the current
# working directory.
df = pd.read_csv('demo.csv')
print(df)

  first-name last-name
0     Daniel      Creg
1     Elaine  Fredrick
2       John     Rambo


In [None]:
# Categorical Data ==> data type

# The categorical data type is useful in the following cases:

    # A string variable consisting of only a few different values. 
    # Converting such a string variable to a categorical variable will save some memory

    # The lexical order of a variable is not the same as the logical order (“one”, “two”, “three”). By converting to a categorical and specifying an order on the categories, sorting and min/max will use the logical order instead of the lexical order.

    # As a signal to other Python libraries that this column should be treated as a categorical variable (e.g. to use suitable statistical methods or plot types).


In [52]:
import pandas as pd

# A Series stored with the categorical dtype.
values = ["a", "b", "c", "a"]
s = pd.Series(values, dtype="category")
print(s)


# A column of an existing DataFrame can be converted the same way:
# df = pd.DataFrame({"A": ["a", "b", "c", "a"]})
# df["B"] = df["A"].astype("category")
# print(df)


# drop_duplicates returns a new Series without the repeated value;
# s itself is left unchanged.
print('after deleting')
deduplicated = s.drop_duplicates()
print(deduplicated)

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']
after deleting
0    a
1    b
2    c
dtype: category
Categories (3, object): ['a', 'b', 'c']


In [None]:
# head()
#     head function is used to see the first few rows of the dataset.


# info() 
#     The info function highlights the total number of rows in the dataset, names of the columns, their data type, and any missing value.
#     It is used to print the summary of a data frame.


# describe()
#     The describe method computes some summary statistics for numerical columns, like mean and median. 
#     “count” is the number of non-missing values in each column.
#     Describe is good for a quick overview of numeric variables.

    
# sort_values()
#     You can sort rows using the sort_values method, passing in a column name that you want to sort by

    
# value_counts()
#     value_counts() is used when you want to see the count of unique values of various columns.
    
    
# drop_duplicates()
#     drop_duplicates() removes duplicate rows from the dataset.


# groupby()
#     Pandas groupby() function is used to split the data into groups using some predefined criteria.


# Pivot Table 
#     The pivot table takes column-wise data as input and groups the entries into a tabular format.

In [None]:
# conditional filtering

# stack, unstack, set_index and reset_index are the 4 fundamental DataFrame reshaping operations.

# df.stack 
#     moves a level (or levels) of the column index into the row index.

# df.unstack
#     moves a level (or levels) of the row index into the column index.

# df.set_index 
#     moves column values into the row index

# df.reset_index 
#     moves a level (or levels) of the row index into a column of values

# Together, these 4 methods allow you to move data in your DataFrame anywhere you want -- in the columns, the row index or the column index.

In [59]:
# Apply function
# DataFrame.apply(func, axis=0, raw=False, result_type=None, args=(), **kwargs)

import pandas as pd
import numpy as np

# (An earlier throw-away frame that was immediately overwritten has been
# removed; this is the frame every example below works on.)
df = pd.DataFrame([[4, 9], [9,25], [16, 100], [25, 225]], columns=['A', 'B'])

print(df)


print('apply function output')
# Other calls to try:
# print(df.apply(np.sqrt))

# print(df.apply(np.sum, axis=0))

# axis=1 calls the function once per row; the list each call returns is
# kept as a single object, giving a Series of lists.
print(df.apply(lambda x: [1, 2], axis=1))

    A    B
0   4    9
1   9   25
2  16  100
3  25  225
apply function output
0    [1, 2]
1    [1, 2]
2    [1, 2]
3    [1, 2]
dtype: object
