### DataFrame Methods
* DataFrame methods can behave differently from their Series counterparts

In [2]:
import pandas as pd

In [3]:
# A Series and a DataFrame share many method names, but the methods can
# behave differently. Start with a small Series for the comparison.
some_series = pd.Series([2, 4, 1])
some_series

0    2
1    4
2    1
dtype: int64

In [4]:
# Build a two-column DataFrame from a dict that maps each column name
# to the list of values in that column.
data = dict(
    Column1=[6, 7, 8],
    Column2=[1, 2, 4],
)
some_df = pd.DataFrame(data)
some_df

Unnamed: 0,Column1,Column2
0,6,1
1,7,2
2,8,4


In [5]:
# Compare sum() on both objects:
#   - Series.sum() collapses the whole Series into a single value.
#   - DataFrame.sum() adds up each column and returns a Series of column totals.
# The empty string in the middle prints a blank line between the two results.
print(some_series.sum(), "", some_df.sum(), sep="\n")

7

Column1    21
Column2     7
dtype: int64


In [6]:
# DataFrame.sum() adds down each column by default; pass axis=1 to add
# across the columns instead, which gives one total per row.
some_df.sum(axis=1)

0     7
1     9
2    12
dtype: int64

In [7]:
# The axis can also be given by name: axis='columns' is the same as axis=1.
some_df.sum(axis='columns')

0     7
1     9
2    12
dtype: int64

In [8]:
# axis='rows' is the same as axis=0: one total per column.
# ('index' is the spelling used in the pandas documentation for this axis.)
some_df.sum(axis='rows')

Column1    21
Column2     7
dtype: int64

In [9]:
# Next, look at info() and describe() on a larger dataset.
# Load imdb_top_1000.csv into a new DataFrame.

df = pd.read_csv("imdb_top_1000.csv")
df.info()
# info() prints a summary of the DataFrame:
#   - the index type and range (RangeIndex)
#   - the total number of columns
#   - each column's non-null count (which reveals missing values) and dtype
#   - the memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   object 
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 125.1+ KB


In [10]:
# describe() provides summary statistics (count, mean, std, min, quartiles, max)
# for the numerical columns in the DataFrame.
df.describe()
# By default describe() only covers the numeric columns.
# Gross holds numbers but its dtype is object, so it is left out here.
# Once Gross is converted to a numeric dtype it should show up in describe() too.

# Passing include='all' to describe() instead summarizes both numerical
# and categorical columns.

Unnamed: 0,IMDB_Rating,Meta_score,No_of_Votes
count,1000.0,843.0,1000.0
mean,7.9493,77.97153,273692.9
std,0.275491,12.376099,327372.7
min,7.6,28.0,25088.0
25%,7.7,70.0,55526.25
50%,7.9,79.0,138548.5
75%,8.1,87.0,374161.2
max,9.3,100.0,2343110.0


In [16]:
# The summary is itself a DataFrame, so it can be rounded to whole numbers.
df.describe().round(decimals=0)

Unnamed: 0,IMDB_Rating,Meta_score,No_of_Votes
count,1000.0,843.0,1000.0
mean,8.0,78.0,273693.0
std,0.0,12.0,327373.0
min,8.0,28.0,25088.0
25%,8.0,70.0,55526.0
50%,8.0,79.0,138548.0
75%,8.0,87.0,374161.0
max,9.0,100.0,2343110.0


In [17]:
# In the CSV file the Gross values are written with commas as thousands
# separators, so the column is loaded as dtype object rather than as numbers.
df['Gross']

0       28,341,469
1      134,966,411
2      534,858,444
3       57,300,000
4        4,360,000
          ...     
995            NaN
996            NaN
997     30,500,000
998            NaN
999            NaN
Name: Gross, Length: 1000, dtype: object

In [24]:
# Strip the thousands separators so the values can later be converted to numbers.
# regex=False makes the comma a literal match explicitly, because the default of
# `regex` differs between pandas versions (True before 2.0, False from 2.0 on).
# Missing values stay NaN and the column is still dtype object after this step.
df['Gross'] = df['Gross'].str.replace(',', '', regex=False)
df['Gross']

0       28341469
1      134966411
2      534858444
3       57300000
4        4360000
         ...    
995          NaN
996          NaN
997     30500000
998          NaN
999          NaN
Name: Gross, Length: 1000, dtype: object

In [25]:
# Replace the missing Gross values with 0, then convert the column to integers.
# The result is assigned back to df['Gross'] instead of calling
# fillna(..., inplace=True) on the selected column: that in-place call works on
# an intermediate object, triggers a chained-assignment warning in recent pandas,
# and leaves df unchanged once Copy-on-Write is enabled, so the NaNs would
# survive and the astype('int64') conversion would fail.
df['Gross'] = df['Gross'].fillna(0).astype('int64')
df

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,https://m.media-amazon.com/images/M/MV5BNGEwMT...,Breakfast at Tiffany's,1961,A,115 min,"Comedy, Drama, Romance",7.6,A young New York socialite becomes interested ...,76.0,Blake Edwards,Audrey Hepburn,George Peppard,Patricia Neal,Buddy Ebsen,166544,0
996,https://m.media-amazon.com/images/M/MV5BODk3Yj...,Giant,1956,G,201 min,"Drama, Western",7.6,Sprawling epic covering the life of a Texas ca...,84.0,George Stevens,Elizabeth Taylor,Rock Hudson,James Dean,Carroll Baker,34075,0
997,https://m.media-amazon.com/images/M/MV5BM2U3Yz...,From Here to Eternity,1953,Passed,118 min,"Drama, Romance, War",7.6,"In Hawaii in 1941, a private is cruelly punish...",85.0,Fred Zinnemann,Burt Lancaster,Montgomery Clift,Deborah Kerr,Donna Reed,43374,30500000
998,https://m.media-amazon.com/images/M/MV5BZTBmMj...,Lifeboat,1944,,97 min,"Drama, War",7.6,Several survivors of a torpedoed merchant ship...,78.0,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix,26471,0


In [26]:
# Gross is numeric now, so the rounded summary gains a Gross column.
df.describe().round(decimals=0)

Unnamed: 0,IMDB_Rating,Meta_score,No_of_Votes,Gross
count,1000.0,843.0,1000.0,1000.0
mean,8.0,78.0,273693.0,56536878.0
std,0.0,12.0,327373.0,103238179.0
min,8.0,28.0,25088.0,0.0
25%,8.0,70.0,55526.0,445710.0
50%,8.0,79.0,138548.0,10702752.0
75%,8.0,87.0,374161.0,61539891.0
max,9.0,100.0,2343110.0,936662225.0


In [27]:
# count() gives the number of non-null values in each column.
df.count()

Poster_Link      1000
Series_Title     1000
Released_Year    1000
Certificate       899
Runtime          1000
Genre            1000
IMDB_Rating      1000
Overview         1000
Meta_score        843
Director         1000
Star1            1000
Star2            1000
Star3            1000
Star4            1000
No_of_Votes      1000
Gross            1000
dtype: int64

In [28]:
# Mean of each column, rounded to whole numbers.
# numeric_only=True restricts the calculation to the numeric columns, which is
# needed here because the DataFrame holds both string and numeric columns.
df.mean(numeric_only=True).round(decimals=0)

IMDB_Rating           8.0
Meta_score           78.0
No_of_Votes      273693.0
Gross          56536878.0
dtype: float64

In [11]:
# Largest value in each numeric column, rounded to whole numbers.
# NOTE(review): the saved output below has no Gross entry even though Gross is
# numeric by this point in the notebook; the execution count (In [11]) suggests
# this cell was last run before the conversion. Re-run the notebook top to
# bottom to refresh the output.
df.max(numeric_only=True).round(decimals=0)

IMDB_Rating          9.0
Meta_score         100.0
No_of_Votes    2343110.0
dtype: float64

In [12]:
# Smallest value in each numeric column, rounded to whole numbers.
# NOTE(review): the saved output below has no Gross entry even though Gross is
# numeric by this point in the notebook; the execution count (In [12]) suggests
# this cell was last run before the conversion. Re-run the notebook top to
# bottom to refresh the output.
df.min(numeric_only=True).round(decimals=0)

IMDB_Rating        8.0
Meta_score        28.0
No_of_Votes    25088.0
dtype: float64

In [14]:
# Summarize only the object-dtype (text) columns. For each column the result shows:
#   count  - number of non-null values
#   unique - number of distinct values
#   top    - the most frequent value
#   freq   - how many times that most frequent value occurs
# NOTE(review): the saved output below still lists Gross as an object column
# with 831 non-null values; the execution count (In [14]) suggests this cell was
# last run before Gross was converted to int64. Re-run top to bottom to refresh.
df.describe(include=["object"])

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,Overview,Director,Star1,Star2,Star3,Star4,Gross
count,1000,1000,1000,899,1000,1000,1000,1000,1000,1000,1000,1000,831
unique,1000,999,100,16,140,202,1000,548,660,841,891,939,823
top,https://m.media-amazon.com/images/M/MV5BODIxNj...,Drishyam,2014,U,130 min,Drama,Alcatraz is the most secure prison of its time...,Alfred Hitchcock,Tom Hanks,Emma Watson,Rupert Grint,Michael Caine,4360000
freq,1,2,32,234,23,85,1,14,12,7,5,4,5
