In [8]:
import pandas as pd
import numpy as np

In [3]:
# Toy order log: one row per purchase. The (Laptop, Andrew) pair appears twice.
df = pd.DataFrame(
    dict(
        Accessories=["Laptop", "Laptop", "Ipad", "Ipad", "Tablet", "Laptop"],
        customer=["Andrew", "Andrew", "Tom", "Andrew", "Tobey", "Peter"],
        quantity=[1, 2, 2, 3, 1, 2],
    )
)

df

Unnamed: 0,Accessories,customer,quantity
0,Laptop,Andrew,1
1,Laptop,Andrew,2
2,Ipad,Tom,2
3,Ipad,Andrew,3
4,Tablet,Tobey,1
5,Laptop,Peter,2


In [4]:
# Total quantity for each (Accessories, customer) pair.
df.groupby(by=["Accessories", "customer"])["quantity"].sum()

Accessories  customer
Ipad         Andrew      3
             Tom         2
Laptop       Andrew      3
             Peter       2
Tablet       Tobey       1
Name: quantity, dtype: int64

In [5]:
# The (Laptop, Andrew) pair occurs twice in df, so pivot() cannot place both
# quantities in a single cell and raises ValueError. Catch it and print the
# message so this demonstration does not abort a Restart & Run All.
try:
    df.pivot(index="Accessories", columns="customer", values="quantity")
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Index contains duplicate entries, cannot reshape

In [6]:
# Index by the same pair without aggregating, so the repeated (Laptop, Andrew) rows stay visible.
df.set_index(keys=["Accessories", "customer"]).loc[:, "quantity"]

Accessories  customer
Laptop       Andrew      1
             Andrew      2
Ipad         Tom         2
             Andrew      3
Tablet       Tobey       1
Laptop       Peter       2
Name: quantity, dtype: int64

## Problem Statement

- Given the data frame df as input, do the following steps for preprocessing:

1. Remove the row if all the columns have missing values.

2. Replace the missing values in the "Roll_ID" column with 0 and in the "Name" column with "Anonymous".

3. Replace the missing values in the "Marks" column with the median value of that column.

4. Change the numerical columns (Roll_ID and Marks) to int datatype in the output

`Note: Every column of the input data frame df has the object dtype`

In [35]:
# Raw student records with gaps in every column; row 3 is missing in all three.
data = pd.DataFrame(
    {
        "Roll_ID": [412, np.nan, 456, np.nan, 434, 429, 418],
        "Name": ["John", "Mitra", "Ritz", np.nan, "Anny", "Hema", np.nan],
        "Marks": [np.nan, 32, 25, np.nan, 35, 28, 38],
    }
)

# Keep an independent snapshot of the raw frame. A plain `test = data` would
# only bind a second name to the same object, so the in-place cleaning applied
# to `data` in later cells would also rewrite `test`.
test = data.copy()
data

Unnamed: 0,Roll_ID,Name,Marks
0,412.0,John,
1,,Mitra,32.0
2,456.0,Ritz,25.0
3,,,
4,434.0,Anny,35.0
5,429.0,Hema,28.0
6,418.0,,38.0


In [21]:
# Step 1: discard rows in which every column is missing (modifies `data` in place).
data.dropna(axis="index", how="all", inplace=True)
data

Unnamed: 0,Roll_ID,Name,Marks
0,412.0,John,
1,,Mitra,32.0
2,456.0,Ritz,25.0
4,434.0,Anny,35.0
5,429.0,Hema,28.0
6,418.0,,38.0


In [24]:
# Step 2: per-column replacements for missing values (modifies `data` in place).
data.fillna(
    {"Roll_ID": 0, "Name": "Anonymous"},
    inplace=True,
)
data

Unnamed: 0,Roll_ID,Name,Marks
0,412.0,John,
1,0.0,Mitra,32.0
2,456.0,Ritz,25.0
4,434.0,Anny,35.0
5,429.0,Hema,28.0
6,418.0,Anonymous,38.0


In [29]:
# Step 3: replace missing "Marks" with the column median.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the `data["Marks"]` selection acts on an intermediate object, which under
# pandas Copy-on-Write leaves `data` itself unchanged.
data["Marks"] = data["Marks"].fillna(data["Marks"].median())
data

Unnamed: 0,Roll_ID,Name,Marks
0,412.0,John,32.0
1,0.0,Mitra,32.0
2,456.0,Ritz,25.0
4,434.0,Anny,35.0
5,429.0,Hema,28.0
6,418.0,Anonymous,38.0


In [33]:
# Step 4: cast the two numeric columns to integers.
data = data.astype({column: "int" for column in ("Roll_ID", "Marks")})

In [34]:
# Display the current contents of `data`.
data

Unnamed: 0,Roll_ID,Name,Marks
0,412,John,32
1,0,Mitra,32
2,456,Ritz,25
4,434,Anny,35
5,429,Hema,28
6,418,Anonymous,38


In [36]:
# Display the current contents of `test`.
test

Unnamed: 0,Roll_ID,Name,Marks
0,412.0,John,
1,,Mitra,32.0
2,456.0,Ritz,25.0
3,,,
4,434.0,Anny,35.0
5,429.0,Hema,28.0
6,418.0,,38.0


In [42]:
# Number of missing values in each column of `test`.
test.isnull().sum()

Roll_ID    2
Name       2
Marks      2
dtype: int64

In [43]:
# Number of missing values in the "Roll_ID" column only.
test.loc[:, "Roll_ID"].isna().sum()

2

In [44]:
# Total number of missing cells across the whole frame.
test.isna().to_numpy().sum()

6

In [50]:
# Number of missing cells in the row labelled 3.
test.loc[[3]].isna().sum(axis="columns").sum()

3

In [51]:
def fun(test):
    """Replace every missing value in ``test`` with 1 and return the frame.

    The frame is modified in place, so existing callers that ignore the return
    value still see the filled data. The same object is returned; previously
    the function always returned ``None``, because ``fillna(..., inplace=True)``
    has no return value.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame whose missing values are overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``test`` object, after filling.
    """
    test.fillna(1, inplace=True)
    return test

# Fill the missing values of `test`; the frame itself is modified by the call.
fun(test)

In [52]:
# Display the current contents of `test`.
test

Unnamed: 0,Roll_ID,Name,Marks
0,412.0,John,1.0
1,1.0,Mitra,32.0
2,456.0,Ritz,25.0
3,1.0,1,1.0
4,434.0,Anny,35.0
5,429.0,Hema,28.0
6,418.0,1,38.0


In [63]:
# Number of rows whose "Name" contains the letter n; non-string entries count as no match.
len(test.loc[test["Name"].str.contains('n', na=False)])

2

In [64]:
# Boolean mask over "Name": True where the value contains 'n'; na = False makes
# entries that cannot be matched count as False.
# NOTE(review): the recorded output below is the scalar 7, not a Series, so it
# presumably comes from an earlier version of this cell. Re-run to refresh it.
test["Name"].str.contains('n', na = False)

7

In [65]:
# Access identifiers, some carrying an "ORG" prefix and some purely numeric.
# NOTE: the name `str` shadows the Python built-in; it is kept because the
# following cells refer to this frame by that name.
str = pd.DataFrame(
    {
        'access_id': dict(
            enumerate(
                [
                    'ORG6684',
                    '4564',
                    'ORG6995',
                    '2130',
                    '5839',
                    'ORG1281',
                    'ORG2651',
                    'ORG9870',
                    'ORG4089',
                    'ORG3794',
                ]
            )
        )
    }
)

str

Unnamed: 0,access_id
0,ORG6684
1,4564
2,ORG6995
3,2130
4,5839
5,ORG1281
6,ORG2651
7,ORG9870
8,ORG4089
9,ORG3794


In [78]:
# Boolean mask: True where access_id starts with "ORG".
# The mask is kept as a Series so it can be used for row selection with .loc.
# Appending .count() here would store the integer 10 (the number of non-null
# entries) under a name that promises a mask.
valid_id_mask = str["access_id"].str.startswith("ORG")

# Number of entries that were checked (non-null mask values).
valid_id_mask.count()

10

In [76]:
# Count the access_id values in the rows selected by valid_id_mask.
str.loc[valid_id_mask, "access_id"].count()

7

In [77]:
# Total number of rows in the access-id frame.
len(str.index)

10

In [187]:
# Student scores: one row per record, with an ISO-formatted date string,
# a record id, and marks in three subjects.
stu = pd.DataFrame(
    {
        'Date': [
            "2015-12-06",
            "2011-12-27",
            "2015-09-07",
            "2012-12-21",
            "2020-02-13",
            "2015-06-09",
        ],
        'RID': [498, 721, 375, 464, 813, 853],
        'Phy': [22, 45, 1, 65, 22, 17],
        'Chem': [52, 56, 32, 50, 24, 61],
        'Math': [63, 37, 68, 62, 43, 42],
    }
)
stu

Unnamed: 0,Date,RID,Phy,Chem,Math
0,2015-12-06,498,22,52,63
1,2011-12-27,721,45,56,37
2,2015-09-07,375,1,32,68
3,2012-12-21,464,65,50,62
4,2020-02-13,813,22,24,43
5,2015-06-09,853,17,61,42


In [188]:
# Display the current contents of `stu`.
stu

Unnamed: 0,Date,RID,Phy,Chem,Math
0,2015-12-06,498,22,52,63
1,2011-12-27,721,45,56,37
2,2015-09-07,375,1,32,68
3,2012-12-21,464,65,50,62
4,2020-02-13,813,22,24,43
5,2015-06-09,853,17,61,42


In [189]:
# Parse the "Date" strings into datetime values, overwriting the column.
stu["Date"] = stu["Date"].pipe(pd.to_datetime)

In [190]:
# Print the column dtypes and non-null counts of `stu`.
stu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    6 non-null      datetime64[ns]
 1   RID     6 non-null      int64         
 2   Phy     6 non-null      int64         
 3   Chem    6 non-null      int64         
 4   Math    6 non-null      int64         
dtypes: datetime64[ns](1), int64(4)
memory usage: 368.0 bytes


In [194]:
# Overwrite "Date" with the name of each timestamp's month (e.g. December).
# The column must still hold datetimes when this runs, so the cell cannot be
# re-run without first rebuilding `stu`.
stu["Date"] = stu.loc[:, "Date"].dt.month_name()

In [195]:
# Display the current contents of `stu`.
stu

Unnamed: 0,Date,RID,Phy,Chem,Math
0,December,498,22,52,63
1,December,721,45,56,37
2,September,375,1,32,68
3,December,464,65,50,62
4,February,813,22,24,43
5,June,853,17,61,42


In [196]:
# Rows per month name, ordered alphabetically by month name.
stu.loc[:, "Date"].value_counts().sort_index()

December     3
February     1
June         1
September    1
Name: Date, dtype: int64

In [198]:
# Count of the most frequent month: value_counts() lists counts from highest
# to lowest, so the first entry is the largest.
high_freq = stu["Date"].value_counts().iloc[0]
high_freq

3

In [199]:
# Label of the most frequent month: the first key of the descending value_counts() result.
month = stu["Date"].value_counts().keys()[0]

In [200]:
# Display the most frequent month name.
month

'December'