In [1]:
import pandas as pd
import numpy as np

# Working with Series 

### 1) creating the series 

In [2]:
lst = [1,2,3,4,5]
pd.Series(lst)

0    1
1    2
2    3
3    4
4    5
dtype: int64

#### numpy array to series 

In [3]:
arr = np.array(lst)
pd.Series(arr)

0    1
1    2
2    3
3    4
4    5
dtype: int32

### 2) creating the series by our end

In [4]:
pd.Series(data=['Abhinav','Daksh','Jagdish','Leo'],index=[1,2,3,4])


1    Abhinav
2      Daksh
3    Jagdish
4        Leo
dtype: object

### 3) creating the series through dictionary 

In [5]:
steps = {'day1':2000,'day2':4000,'day3':1000}
steps
pd.Series(steps)

day1    2000
day2    4000
day3    1000
dtype: int64

### 4) using repeat() to create a new series

In [6]:
pd.Series(6).repeat(4)


0    6
0    6
0    6
0    6
dtype: int64

#### with accurate indexing

In [7]:
pd.Series(6).repeat(4).reset_index()


Unnamed: 0,index,0
0,0,6
1,0,6
2,0,6
3,0,6


In [8]:
s = pd.Series([10,20]).repeat([5,2]).reset_index(drop=True)
s

0    10
1    10
2    10
3    10
4    10
5    20
6    20
dtype: int64

### 5) accessing the elements

In [12]:
s[0]

10

In [14]:
s[2:4]

2    10
3    10
dtype: int64

### 6) Aggregate function on series

In [21]:
sr = pd.Series([1,2,3,4,5,6,7])
print(sr.max())
print(sr.min())
print(sr.sum())

7
1
28


### 7) Series Absolute function 

In [23]:
sr = pd.Series([1,-2,3,4,-5,6,7])
sr.abs()

0    1
1    2
2    3
3    4
4    5
5    6
6    7
dtype: int64

### 8) Appending Series

In [31]:
sr1 = pd.Series([1,-2,3,4,-5,6,7])
sr2 = pd.Series([8,-9,10,11])

pd.concat([sr1, sr2]).reset_index(drop=True)

0      1
1     -2
2      3
3      4
4     -5
5      6
6      7
7      8
8     -9
9     10
10    11
dtype: int64

### 9) Astype function

In [34]:
type(sr1[0])

numpy.int64

In [38]:
sr1.astype('float')


0    1.0
1   -2.0
2    3.0
3    4.0
4   -5.0
5    6.0
6    7.0
dtype: float64

### 10) Between function

In [39]:
sr1 = pd.Series([1,2,3,4,5,6,7,8,9,10])

sr1.between(4,8)

0    False
1    False
2    False
3     True
4     True
5     True
6     True
7     True
8    False
9    False
dtype: bool

### 11) All strings function can be used to extract or modify texts in a series

In [40]:
ser = pd.Series(['Eshant Das','Data Science','GFG','Hello World','Machine Learning'])


In [42]:
ser.str.upper()

0          ESHANT DAS
1        DATA SCIENCE
2                 GFG
3         HELLO WORLD
4    MACHINE LEARNING
dtype: object

In [43]:
ser.str.lower()

0          eshant das
1        data science
2                 gfg
3         hello world
4    machine learning
dtype: object

In [45]:
for i in ser:
    print(len(i))

10
12
3
11
16


In [47]:
ser = ser.str.strip() # all extra spaces remove 

In [50]:
ser.str.split()

0          [Eshant, Das]
1        [Data, Science]
2                  [GFG]
3         [Hello, World]
4    [Machine, Learning]
dtype: object

In [53]:
ser.str.contains('a')

0     True
1     True
2    False
3    False
4     True
dtype: bool

In [55]:
ser.str.count('a')

0    2
1    2
2    0
3    0
4    2
dtype: int64

In [57]:
ser.str.startswith('G')

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [59]:
ser.str.endswith('s')

0     True
1    False
2    False
3    False
4    False
dtype: bool

In [60]:
ser.str.find('a')

0    3
1    1
2   -1
3   -1
4    1
dtype: int64

### 12) converting a series to list

In [64]:
print(ser.tolist())
type(ser.tolist())

['Eshant Das', 'Data Science', 'GFG', 'Hello World', 'Machine Learning']


list

# Detailed implementation on Pandas DataFrame

### 1) creating DataFrame

In [65]:
lst = ['Aman','Raju','Doyla','Vansh']
pd.DataFrame(lst)

Unnamed: 0,0
0,Aman
1,Raju
2,Doyla
3,Vansh


In [None]:
lt = [['A',2],['B',8],['C',90]]
pd.DataFrame(lt)

Unnamed: 0,0,1
0,A,2
1,B,8
2,C,90


In [68]:
data = {'name':['A','B','C'],'age':[9,65,34]}

pd.DataFrame(data)

Unnamed: 0,name,age
0,A,9
1,B,65
2,C,34


In [73]:
data = { 'Name'         :['Jai', 'Princi', 'Gaurav', 'Anuj'],
         'Age'          :[27, 24, 22, 32],
         'Address'      :['Delhi', 'Kanpur', 'Allahabad', 'Kannauj'],
         'Qualification':['Msc', 'MA', 'MCA', 'Phd']}

df = pd.DataFrame(data)
df[['Name','Qualification']]

Unnamed: 0,Name,Qualification
0,Jai,Msc
1,Princi,MA
2,Gaurav,MCA
3,Anuj,Phd


### 2) Slicing in Dataframes using iloc and loc

In [74]:
data = {'one'   : pd.Series([1, 2, 3, 4]),
        'two'   : pd.Series([10, 20, 30, 40]),
        'three' : pd.Series([100, 200, 300, 400]),
        'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [77]:
df.loc[1:3 , 'two':'four' ]

Unnamed: 0,two,three,four
1,20,200,2000
2,30,300,3000
3,40,400,4000


In [None]:
df.iloc[1:4,1:3] # [rows,cols]

Unnamed: 0,two,three
1,20,200
2,30,300
3,40,400


##### Selecting specific rows

In [None]:
df.iloc[[0,2],[1,3]] # rows and columns

Unnamed: 0,two,four
0,10,1000
2,30,3000


### 3) Slicing in Dataframes using conditions

In [83]:
df[df['two']>20]

Unnamed: 0,one,two,three,four
2,3,30,300,3000
3,4,40,400,4000


### 4) columns Addition in Dataframes 

In [84]:
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [85]:
l = [11,22,33,44]
df['five'] = l
df

Unnamed: 0,one,two,three,four,five
0,1,10,100,1000,11
1,2,20,200,2000,22
2,3,30,300,3000,33
3,4,40,400,4000,44


### 5) column deletion 

In [86]:
del df['five']
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [101]:
df['six'] = df['one']+10
df



Unnamed: 0,one,two,three,four,six
0,1,10,100,1000,11
1,2,20,200,2000,12
2,3,30,300,3000,13
3,4,40,400,4000,14


In [102]:
df.pop('six')
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


### 6) Addition of rows

In [96]:
df1 = pd.DataFrame([[1, 2], [3, 4]], columns = ['a','b'])
df2 = pd.DataFrame([[5, 6], [7, 8]], columns = ['a','b'])


df3 = pd.concat([df1,df2]).reset_index(drop = True)

df3

Unnamed: 0,a,b
0,1,2
1,3,4
2,5,6
3,7,8


### 7) Pandas drop functions

In [116]:
data = { 'one'   : pd.Series([1, 2, 3, 4]),
         'two'   : pd.Series([10, 20, 30, 40]),
         'three' : pd.Series([100, 200, 300, 400]),
         'four'  : pd.Series([1000, 2000, 3000, 4000])}


df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [117]:
# axis = 0 for rows
# axis = 1 for columns

df.drop([0,1],axis=0,inplace=True)
print(df)

   one  two  three  four
2    3   30    300  3000
3    4   40    400  4000


In [118]:
df.drop(['one','four'],axis=1,inplace=True)
df

Unnamed: 0,two,three
2,30,300
3,40,400


### 8) transposing a dataframe

In [119]:
data = { 'one'   : pd.Series([1, 2, 3, 4]),
         'two'   : pd.Series([10, 20, 30, 40]),
         'three' : pd.Series([100, 200, 300, 400]),
         'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [120]:
df.T

Unnamed: 0,0,1,2,3
one,1,2,3,4
two,10,20,30,40
three,100,200,300,400
four,1000,2000,3000,4000


### 9) A set of more DataFrame Functionalities

In [None]:
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [122]:
df.axes

[RangeIndex(start=0, stop=4, step=1),
 Index(['one', 'two', 'three', 'four'], dtype='object')]

In [123]:
df.ndim

2

In [125]:
df.dtypes

one      int64
two      int64
three    int64
four     int64
dtype: object

In [126]:
df.shape

(4, 4)

In [129]:
df.head(3)

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000


In [128]:
df.tail()

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [130]:
df = pd.DataFrame()
df.empty

True

### 10) Statistical or Mathematical Functions

In [131]:
data = {'one'   : pd.Series([1, 2, 3, 4]),
        'two'   : pd.Series([10, 20, 30, 40]),
        'three' : pd.Series([100, 200, 300, 400]),
        'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [134]:
df.sum()

one         10
two        100
three     1000
four     10000
dtype: int64

In [135]:
df.mean()

one         2.5
two        25.0
three     250.0
four     2500.0
dtype: float64

In [136]:
df.median()

one         2.5
two        25.0
three     250.0
four     2500.0
dtype: float64

In [132]:
de = pd.DataFrame({'A': [1, 2, 2, 3, 4, 4, 4, 5], 'B': [10, 20, 20, 30, 40, 40, 50, 60]})

print('A' , de['A'].mode())
print('B' , de['B'].mode())

A 0    4
Name: A, dtype: int64
B 0    20
1    40
Name: B, dtype: int64


In [138]:
df.min()

one         1
two        10
three     100
four     1000
dtype: int64

In [139]:
df.max()

one         4
two        40
three     400
four     4000
dtype: int64

In [140]:
df.var()

one      1.666667e+00
two      1.666667e+02
three    1.666667e+04
four     1.666667e+06
dtype: float64

In [141]:
df.std()

one         1.290994
two        12.909944
three     129.099445
four     1290.994449
dtype: float64

### 11) Describe function

In [142]:
data = {'one'  : pd.Series([1, 2, 3, 4]),
        'two'  : pd.Series([10, 20, 30, 40]),
        'three': pd.Series([100, 200, 300, 400]),
        'four' : pd.Series([1000, 2000, 3000, 4000]),
        'five' : pd.Series(['A','B','C','D'])}


df = pd.DataFrame(data)

df.describe()

Unnamed: 0,one,two,three,four
count,4.0,4.0,4.0,4.0
mean,2.5,25.0,250.0,2500.0
std,1.290994,12.909944,129.099445,1290.994449
min,1.0,10.0,100.0,1000.0
25%,1.75,17.5,175.0,1750.0
50%,2.5,25.0,250.0,2500.0
75%,3.25,32.5,325.0,3250.0
max,4.0,40.0,400.0,4000.0


### 12) pipe functions 

In [145]:
data = {'one'  : pd.Series([1, 2, 3, 4]),
        'two'  : pd.Series([10, 20, 30, 40]),
        'three': pd.Series([100, 200, 300, 400]),
        'four' : pd.Series([1000, 2000, 3000, 4000])}


def add_(i,j):
    return i+j

def sub_(i,j):
    return i-j



df = pd.DataFrame(data)
df


Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [148]:
df.pipe(add_,20) # i give second parameter only

Unnamed: 0,one,two,three,four
0,21,30,120,1020
1,22,40,220,2020
2,23,50,320,3020
3,24,60,420,4020


In [149]:
df.pipe(sub_,20) # j give second parameter only

Unnamed: 0,one,two,three,four
0,-19,-10,80,980
1,-18,0,180,1980
2,-17,10,280,2980
3,-16,20,380,3980


In [152]:
df.pipe(add_,30).pipe(sub_,9)

Unnamed: 0,one,two,three,four
0,22,31,121,1021
1,23,41,221,2021
2,24,51,321,3021
3,25,61,421,4021


##### apply function

In [154]:
data = {'one'  : pd.Series([1, 2, 3, 4]),
        'two'  : pd.Series([10, 20, 30, 40]),
        'three': pd.Series([100, 200, 300, 400]),
        'four' : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)
df

print(df.apply(np.mean))

one         2.5
two        25.0
three     250.0
four     2500.0
dtype: float64


##### map function

In [155]:
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [156]:
df.applymap(lambda x: x*10)

  df.applymap(lambda x: x*10)


Unnamed: 0,one,two,three,four
0,10,100,1000,10000
1,20,200,2000,20000
2,30,300,3000,30000
3,40,400,4000,40000


### 13) Reindex function

In [162]:
data = { 'one'   : pd.Series([1, 2, 3, 4]),
         'two'   : pd.Series([10, 20, 30, 40]),
         'three' : pd.Series([100, 200, 300, 400]),
         'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)

print(df)
print('-'*30)
print(df.reindex(index = [1,0,3,2]))

   one  two  three  four
0    1   10    100  1000
1    2   20    200  2000
2    3   30    300  3000
3    4   40    400  4000
------------------------------
   one  two  three  four
1    2   20    200  2000
0    1   10    100  1000
3    4   40    400  4000
2    3   30    300  3000


In [158]:
data = {'Name' : ['John', 'Jane', 'Jim', 'Joan'],
        'Age'  : [25, 30, 35, 40],
        'City' : ['New York', 'Los Angeles', 'Chicago', 'Houston']}

df = pd.DataFrame(data)

df.reindex(columns = ['Name','City','Age'])

Unnamed: 0,Name,City,Age
0,John,New York,25
1,Jane,Los Angeles,30
2,Jim,Chicago,35
3,Joan,Houston,40


### 14) Renaming Columns in Pandas DataFrame

In [159]:
data = { 'one'   : pd.Series([1, 2, 3, 4]),
         'two'   : pd.Series([10, 20, 30, 40]),
         'three' : pd.Series([100, 200, 300, 400]),
         'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)

df.rename(columns = {'one' : 'One','two': 'Two', 'three' : 'Three', 'four' : 'Four'}, 
           inplace = True, index = {0:'a',1:'b',2:'c',4:'d'})
df

Unnamed: 0,One,Two,Three,Four
a,1,10,100,1000
b,2,20,200,2000
c,3,30,300,3000
3,4,40,400,4000


### 15) Sorting in Pandas DataFrame

In [163]:
data = { 'one'   : pd.Series([11, 51, 31, 41]),
         'two'   : pd.Series([10, 20, 30, 40]),
         'three' : pd.Series([100, 200, 500, 400]),
         'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,11,10,100,1000
1,51,20,200,2000
2,31,30,500,3000
3,41,40,400,4000


##### sort with specific column

In [164]:
df.sort_values(by ='one')

Unnamed: 0,one,two,three,four
0,11,10,100,1000
2,31,30,500,3000
3,41,40,400,4000
1,51,20,200,2000


In [166]:
df.sort_values(by = ['one','two'], ascending = True)

Unnamed: 0,one,two,three,four
0,11,10,100,1000
2,31,30,500,3000
3,41,40,400,4000
1,51,20,200,2000


##### sort in specific order

In [165]:
df.sort_values(by = 'one', ascending = False)

Unnamed: 0,one,two,three,four
1,51,20,200,2000
3,41,40,400,4000
2,31,30,500,3000
0,11,10,100,1000


##### sort with specific algorithms

In [169]:
df.sort_values(by = ['one'], kind = 'mergesort')

Unnamed: 0,one,two,three,four
0,11,10,100,1000
2,31,30,500,3000
3,41,40,400,4000
1,51,20,200,2000


### 16) Groupby Functions

In [171]:
cricket = {'Team'   : ['India', 'India', 'Australia', 'Australia', 'SA', 'SA', 'SA', 'SA', 'NZ', 'NZ', 'NZ', 'India'],
           'Rank'   : [2, 3, 1,2, 3,4 ,1 ,1,2 , 4,1,2],
           'Year'   : [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],
           'Points' : [876,801,891,815,776,784,834,824,758,691,883,782]}

df = pd.DataFrame(cricket)
df

Unnamed: 0,Team,Rank,Year,Points
0,India,2,2014,876
1,India,3,2015,801
2,Australia,1,2014,891
3,Australia,2,2015,815
4,SA,3,2014,776
5,SA,4,2015,784
6,SA,1,2016,834
7,SA,1,2017,824
8,NZ,2,2016,758
9,NZ,4,2014,691


In [174]:
df.groupby('Team').groups

{'Australia': [2, 3], 'India': [0, 1, 11], 'NZ': [8, 9, 10], 'SA': [4, 5, 6, 7]}

In [175]:
df.groupby(['Team','Year']).get_group(('Australia',2014))

Unnamed: 0,Team,Rank,Year,Points
2,Australia,1,2014,891


In [181]:
df.groupby('Team')['Points'].sum()

Team
Australia    1706
India        2459
NZ           2332
SA           3218
Name: Points, dtype: int64

In [182]:
df.groupby('Team')['Points'].sum().sort_values()


Team
Australia    1706
NZ           2332
India        2459
SA           3218
Name: Points, dtype: int64

In [183]:
df.groupby('Team')['Points'].max()


Team
Australia    891
India        876
NZ           883
SA           834
Name: Points, dtype: int64

In [186]:
groups = df.groupby('Team')

type(groups['Points'])
groups['Points'].mean()

Team
Australia    853.000000
India        819.666667
NZ           777.333333
SA           804.500000
Name: Points, dtype: float64

In [187]:
df.groupby('Team').filter(lambda x: len(x)==4)

Unnamed: 0,Team,Rank,Year,Points
4,SA,3,2014,776
5,SA,4,2015,784
6,SA,1,2016,834
7,SA,1,2017,824


In [188]:
df.groupby('Team').filter(lambda x: len(x)==3)


Unnamed: 0,Team,Rank,Year,Points
0,India,2,2014,876
1,India,3,2015,801
8,NZ,2,2016,758
9,NZ,4,2014,691
10,NZ,1,2015,883
11,India,2,2017,782


# Working with csv files and basic data Analysis Using Pandas

### 1) Reading CSV files

In [203]:
df = pd.read_csv('Football.csv')

df.head()

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016
3,Spain,La Liga,(CAR),Ruben Castro,32,3,2842,13,14.06,0.47,117,42,3.91,1.4,2016
4,Spain,La Liga,(VAL),Kevin Gameiro,21,10,1745,13,10.65,0.58,50,23,2.72,1.25,2016


In [197]:
link = 'https://raw.githubusercontent.com/AshishJangra27/Data-Analysis-with-Python-GFG/main/3.%20Data%20Preprocessing%20-%20Removing%20Null%20Value%20Rows/googleplaystore.csv'

df = pd.read_csv(link)
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


### 2) Pandas Info Function

In [198]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


### 3) isnull() function to check if there are nan values present

In [199]:
df.isnull()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,False,False,False,False,False,False,False,False,False,False,False,False,False
10837,False,False,False,False,False,False,False,False,False,False,False,False,False
10838,False,False,True,False,False,False,False,False,False,False,False,False,False
10839,False,False,False,False,False,False,False,False,False,False,False,False,False


In [200]:
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

### 4)  Quantile function to get the specific percentile value

In [204]:
df.describe(percentiles=[.80])

Unnamed: 0,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,22.371212,3.224242,2071.416667,11.810606,10.089606,0.476167,64.177273,28.365152,2.948015,1.315652,2018.363636
std,9.754658,3.839498,900.595049,6.075315,5.724844,0.192831,34.941622,16.363149,0.914906,0.474239,1.3677
min,2.0,0.0,264.0,2.0,0.71,0.07,5.0,2.0,0.8,0.24,2016.0
50%,24.0,2.0,2245.5,11.0,9.285,0.435,62.0,26.0,2.845,1.25,2019.0
80%,32.0,6.0,2915.8,15.0,14.076,0.61,90.0,39.0,3.6,1.63,2020.0
max,38.0,26.0,4177.0,42.0,32.54,1.35,208.0,102.0,7.2,3.63,2020.0


In [205]:
df['Mins'].quantile(.80)

2915.8

In [206]:
df['Mins'].quantile(.99)


3520.0199999999995

### 5) Copy function

In [207]:
de = df.copy()
df.head(3)

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016


In [208]:
de['Year+100'] = de['Year']+100

In [209]:
de.head()

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year,Year+100
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016,2116
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016,2116
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016,2116
3,Spain,La Liga,(CAR),Ruben Castro,32,3,2842,13,14.06,0.47,117,42,3.91,1.4,2016,2116
4,Spain,La Liga,(VAL),Kevin Gameiro,21,10,1745,13,10.65,0.58,50,23,2.72,1.25,2016,2116


### 6) value counts function

In [212]:
df['Player Names'].value_counts()

Player Names
Andrea Belotti     5
Lionel Messi       5
Luis Suarez        5
Andrej Kramaric    5
Ciro Immobile      5
                  ..
Francois Kamano    1
Lebo Mothiba       1
Gaetan Laborde     1
Falcao             1
Cody Gakpo         1
Name: count, Length: 444, dtype: int64

### 7) Unique and Nunique function

In [213]:
df['Player Names'].unique()

array(['Juanmi Callejon', 'Antoine Griezmann', 'Luis Suarez',
       'Ruben Castro', 'Kevin Gameiro', 'Cristiano Ronaldo',
       'Karim Benzema', 'Neymar ', 'Iago Aspas', 'Sergi Enrich',
       'Aduriz ', 'Sandro Ramlrez', 'Lionel Messi', 'Gerard Moreno',
       'Morata', 'Wissam Ben Yedder', 'Willian Jose', 'Andone ',
       'Cedric Bakambu', 'Isco', 'Mohamed Salah', 'Gregoire Defrel',
       'Ciro Immobile', 'Nikola Kalinic', 'Dries Mertens',
       'Alejandro Gomez', 'Jose CallejOn', 'Iago Falque',
       'Giovanni Simeone', 'Mauro Icardi', 'Diego Falcinelli',
       'Cyril Thereau', 'Edin Dzeko', 'Lorenzo Insigne',
       'Fabio Quagliarella', 'Borriello ', 'Carlos Bacca',
       'Gonzalo Higuain', 'Keita Balde', 'Andrea Belotti', 'Fin Bartels',
       'Lars Stindl', 'Serge Gnabry', 'Wagner ', 'Andrej Kramaric',
       'Florian Niederlechner', 'Robert Lewandowski', 'Emil Forsberg',
       'Timo Werner', 'Nils Petersen', 'Vedad Ibisevic', 'Mario Gomez',
       'Maximilian Philipp',

In [214]:
df['Player Names'].nunique()


444

### 8) Dropna function

In [215]:
link = 'https://raw.githubusercontent.com/AshishJangra27/Data-Analysis-with-Python-GFG/main/3.%20Data%20Preprocessing%20-%20Removing%20Null%20Value%20Rows/googleplaystore.csv'

df = pd.read_csv(link)
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [216]:
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [220]:
df.dropna(inplace=True)

In [221]:
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       0
dtype: int64

### 9) fillna function

In [228]:
link = 'https://raw.githubusercontent.com/AshishJangra27/Data-Analysis-with-Python-GFG/main/3.%20Data%20Preprocessing%20-%20Removing%20Null%20Value%20Rows/googleplaystore.csv'

df = pd.read_csv(link)
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [229]:
x = round(df["Rating"].mean(),2)
df["Rating"].fillna(x, inplace=True)


In [230]:
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              1
Price             0
Content Rating    1
Genres            0
Last Updated      0
Current Ver       8
Android Ver       3
dtype: int64

### 10) Sample function

In [231]:
df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.10,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.90,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.70,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.50,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.30,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.50,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.00,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10838,Parkinson Exercices FR,MEDICAL,4.19,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.50,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


In [233]:
df.sample(8)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
4738,WPSApp,TOOLS,4.4,95080,3.1M,"10,000,000+",Free,0,Everyone,Tools,"August 3, 2018",1.6.22,4.1 and up
7610,Ski Safari: Adventure Time,FAMILY,4.5,48754,9.3M,"100,000+",Paid,$0.99,Everyone,Arcade;Action & Adventure,"October 7, 2014",1.5.2,2.3 and up
8946,WiFi Action Camera,SPORTS,2.4,551,903k,"100,000+",Free,0,Everyone,Sports,"March 13, 2014",1.4,4.0 and up
9988,Design Home,FAMILY,4.4,539931,69M,"10,000,000+",Free,0,Everyone,Simulation,"August 6, 2018",1.11.01,4.2 and up
8319,DF Squid,GAME,4.19,1,14M,100+,Free,0,Everyone,Board,"December 12, 2013",1.0,2.3 and up
6543,Bangla Calendar 1425: (EN-BN-AR) Holiday,LIFESTYLE,4.5,31,2.3M,"1,000+",Free,0,Everyone,Lifestyle,"April 21, 2018",2.1.0,4.1 and up
9941,Tasker,TOOLS,4.6,43045,Varies with device,"1,000,000+",Paid,$2.99,Everyone,Tools,"June 25, 2018",Varies with device,Varies with device
9559,Adivina el cantante de Trap y Reggaeton,GAME,4.5,1914,24M,"100,000+",Free,0,Everyone,Trivia,"June 29, 2018",3.6.7z,4.0.3 and up


### 11) to_csv() function

In [235]:
data = { 'one'   : pd.Series([1, 2, 3, 4]),
         'two'   : pd.Series([10, 20, 30, 40]),
         'three' : pd.Series([100, 200, 300, 400]),
         'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)

df.to_csv('Number.csv')

In [238]:
df = pd.read_csv('Number.csv')
df

Unnamed: 0.1,Unnamed: 0,one,two,three,four
0,0,1,10,100,1000
1,1,2,20,200,2000
2,2,3,30,300,3000
3,3,4,40,400,4000


In [239]:
df.to_csv('Numbers.csv',index=False)

In [240]:
df

Unnamed: 0.1,Unnamed: 0,one,two,three,four
0,0,1,10,100,1000
1,1,2,20,200,2000
2,2,3,30,300,3000
3,3,4,40,400,4000


# A detailed Pandas Profile report

In [None]:
import matplotlib
import pandas_profiling as pp

ModuleNotFoundError: No module named 'ydata_profiling'

In [250]:
df = pd.read_csv('Football.csv')
df.head()

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016
3,Spain,La Liga,(CAR),Ruben Castro,32,3,2842,13,14.06,0.47,117,42,3.91,1.4,2016
4,Spain,La Liga,(VAL),Kevin Gameiro,21,10,1745,13,10.65,0.58,50,23,2.72,1.25,2016


In [247]:
report = pp.ProfileReport(df)

NameError: name 'pp' is not defined

In [None]:
report