In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.DataFrame({
  "Accessories": ["Laptop", "Laptop", "Ipad", "Ipad", "Tablet", "Laptop"],
  "customer": ["Andrew", "Andrew", "Tom", "Andrew", "Tobey", "Peter"],
  "quantity": [1, 2, 2, 3, 1, 2],
})

df

Unnamed: 0,Accessories,customer,quantity
0,Laptop,Andrew,1
1,Laptop,Andrew,2
2,Ipad,Tom,2
3,Ipad,Andrew,3
4,Tablet,Tobey,1
5,Laptop,Peter,2


In [3]:
df.groupby(["Accessories", "customer"])["quantity"].sum()

Accessories  customer
Ipad         Andrew      3
             Tom         2
Laptop       Andrew      3
             Peter       2
Tablet       Tobey       1
Name: quantity, dtype: int64

In [4]:
df.pivot(index="Accessories", columns="customer", values="quantity")

ValueError: Index contains duplicate entries, cannot reshape

In [5]:
df.set_index(["Accessories", "customer"])["quantity"]

Accessories  customer
Laptop       Andrew      1
             Andrew      2
Ipad         Tom         2
             Andrew      3
Tablet       Tobey       1
Laptop       Peter       2
Name: quantity, dtype: int64

## Problem Statement

- Given the data frame df as input, do the following steps for preprocessing:

1. Remove the row if all the columns have missing values.

2. Replace the missing values of "Roll_ID" column with 0 and "Name" column with "Anonymous"

3. Replace the missing values in "Marks" column with the median value of the column

4. Change the numerical columns (Roll_ID and Marks) to int datatype in the output

`Note: Input data frame df has every column of an object type`

In [6]:
data = pd.DataFrame({
                        "Roll_ID": [412, np.nan, 456, np.nan, 434, 429, 418], 
                        "Name": ["John", "Mitra", "Ritz", np.nan, "Anny", "Hema", np.nan], 
                        "Marks": [np.nan, 32, 25, np.nan, 35, 28, 38]
                })

test = data
data

Unnamed: 0,Roll_ID,Name,Marks
0,412.0,John,
1,,Mitra,32.0
2,456.0,Ritz,25.0
3,,,
4,434.0,Anny,35.0
5,429.0,Hema,28.0
6,418.0,,38.0


In [7]:
data.dropna(how = "all", inplace = True)
data

Unnamed: 0,Roll_ID,Name,Marks
0,412.0,John,
1,,Mitra,32.0
2,456.0,Ritz,25.0
4,434.0,Anny,35.0
5,429.0,Hema,28.0
6,418.0,,38.0


In [8]:
data.fillna(value = {"Roll_ID": 0, "Name": "Anonymous"}, inplace = True)
data

Unnamed: 0,Roll_ID,Name,Marks
0,412.0,John,
1,0.0,Mitra,32.0
2,456.0,Ritz,25.0
4,434.0,Anny,35.0
5,429.0,Hema,28.0
6,418.0,Anonymous,38.0


In [9]:
data["Marks"].fillna(data["Marks"].median(), inplace = True)
data

Unnamed: 0,Roll_ID,Name,Marks
0,412.0,John,32.0
1,0.0,Mitra,32.0
2,456.0,Ritz,25.0
4,434.0,Anny,35.0
5,429.0,Hema,28.0
6,418.0,Anonymous,38.0


In [10]:
data = data.astype({"Roll_ID": "int", "Marks": "int"})

In [11]:
data

Unnamed: 0,Roll_ID,Name,Marks
0,412,John,32
1,0,Mitra,32
2,456,Ritz,25
4,434,Anny,35
5,429,Hema,28
6,418,Anonymous,38


In [12]:
test

Unnamed: 0,Roll_ID,Name,Marks
0,412.0,John,32.0
1,0.0,Mitra,32.0
2,456.0,Ritz,25.0
4,434.0,Anny,35.0
5,429.0,Hema,28.0
6,418.0,Anonymous,38.0


In [13]:
test.isna().sum()

Roll_ID    0
Name       0
Marks      0
dtype: int64

In [14]:
test["Roll_ID"].isna().sum()

0

In [15]:
test.isna().sum().sum()

0

In [16]:
test.loc[[3]].isna().sum().sum()

KeyError: "None of [Int64Index([3], dtype='int64')] are in the [index]"

In [17]:
def fun(test):
    return test.fillna(1, inplace = True)

fun(test)

In [18]:
test

Unnamed: 0,Roll_ID,Name,Marks
0,412.0,John,32.0
1,0.0,Mitra,32.0
2,456.0,Ritz,25.0
4,434.0,Anny,35.0
5,429.0,Hema,28.0
6,418.0,Anonymous,38.0


In [19]:
test.loc[test["Name"].str.contains('n', na = False)].shape[0]

3

In [20]:
test["Name"].str.contains('n', na = False)

0     True
1    False
2    False
4     True
5    False
6     True
Name: Name, dtype: bool

In [21]:
str = pd.DataFrame({'access_id': {0: 'ORG6684',   1: '4564',   2: 'ORG6995',   3: '2130',   4: '5839',   5: 'ORG1281',   6: 'ORG2651',   7: 'ORG9870',   8: 'ORG4089',   9: 'ORG3794'}})

str

Unnamed: 0,access_id
0,ORG6684
1,4564
2,ORG6995
3,2130
4,5839
5,ORG1281
6,ORG2651
7,ORG9870
8,ORG4089
9,ORG3794


In [22]:
valid_id_mask = str["access_id"].str.startswith("ORG").count()
valid_id_mask

10

In [23]:
str.loc[valid_id_mask].access_id.count()

KeyError: 10

In [24]:
str.shape[0]

10

In [25]:
stu = pd.DataFrame({'Date':["2015-12-06", "2011-12-27", "2015-09-07", "2012-12-21", "2020-02-13", "2015-06-09"], 'RID':[498, 721, 375, 464, 813, 853], 'Phy':[22, 45, 1, 65, 22, 17], 'Chem':[52, 56, 32, 50, 24, 61], 'Math':[63, 37, 68, 62, 43 ,42]})
stu

Unnamed: 0,Date,RID,Phy,Chem,Math
0,2015-12-06,498,22,52,63
1,2011-12-27,721,45,56,37
2,2015-09-07,375,1,32,68
3,2012-12-21,464,65,50,62
4,2020-02-13,813,22,24,43
5,2015-06-09,853,17,61,42


In [26]:
stu

Unnamed: 0,Date,RID,Phy,Chem,Math
0,2015-12-06,498,22,52,63
1,2011-12-27,721,45,56,37
2,2015-09-07,375,1,32,68
3,2012-12-21,464,65,50,62
4,2020-02-13,813,22,24,43
5,2015-06-09,853,17,61,42


In [27]:
stu["Date"] = pd.to_datetime(stu["Date"])

In [28]:
stu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    6 non-null      datetime64[ns]
 1   RID     6 non-null      int64         
 2   Phy     6 non-null      int64         
 3   Chem    6 non-null      int64         
 4   Math    6 non-null      int64         
dtypes: datetime64[ns](1), int64(4)
memory usage: 368.0 bytes


In [29]:
stu["Date"] = stu["Date"].dt.month_name()

In [30]:
stu

Unnamed: 0,Date,RID,Phy,Chem,Math
0,December,498,22,52,63
1,December,721,45,56,37
2,September,375,1,32,68
3,December,464,65,50,62
4,February,813,22,24,43
5,June,853,17,61,42


In [31]:
stu["Date"].value_counts().sort_index()

December     3
February     1
June         1
September    1
Name: Date, dtype: int64

In [32]:
high_freq = stu["Date"].value_counts().values[0]
high_freq

3

In [33]:
month = stu["Date"].value_counts().index[0]

In [34]:
month

'December'

In [35]:
df_new = pd.DataFrame({"City\tState": ["Kolkata\tWestBengal", "Chennai\tTamil Nadu", "Hyderabad\tTelangana", "Banglore\tKarnataka"]})
df_new

Unnamed: 0,City\tState
0,Kolkata\tWestBengal
1,Chennai\tTamil Nadu
2,Hyderabad\tTelangana
3,Banglore\tKarnataka


In [36]:
df_new_split = df_new["City\tState"].str.split("\t", expand = True)
df_new_split.values

array([['Kolkata', 'WestBengal'],
       ['Chennai', 'Tamil Nadu'],
       ['Hyderabad', 'Telangana'],
       ['Banglore', 'Karnataka']], dtype=object)

In [37]:
# df_new_split.columns = df_new.columns[0].split("/t")
df_new_split = pd.DataFrame(df_new_split.values, columns = df_new.columns[0].split('\t'))
df_new_split

Unnamed: 0,City,State
0,Kolkata,WestBengal
1,Chennai,Tamil Nadu
2,Hyderabad,Telangana
3,Banglore,Karnataka


In [38]:
df_new["city"], df_new["State"] = df_new.columns[0].split('/t')

ValueError: not enough values to unpack (expected 2, got 1)

In [39]:
df_new

Unnamed: 0,City\tState
0,Kolkata\tWestBengal
1,Chennai\tTamil Nadu
2,Hyderabad\tTelangana
3,Banglore\tKarnataka


In [40]:
a, b, c = ["a", "b", "c"]

In [41]:
a

'a'

In [42]:
b

'b'

In [43]:
sales = pd.read_csv("https://d2beiqkhq929f0.cloudfront.net/public_assets/assets/000/003/717/original/sales_dataset.csv?1651728217")

sales.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [44]:
sales["StockCode"].nunique()

5305

In [45]:
most_selling = sales["Description"].unique()
most_selling

array(['15CM CHRISTMAS GLASS BALL 20 LIGHTS', 'PINK CHERRY LIGHTS',
       ' WHITE CHERRY LIGHTS', ..., 'mixed up',
       'CREAM HANGING HEART T-LIGHT HOLDER',
       'PAPER CRAFT , LITTLE BIRDIE'], dtype=object)

In [46]:
sales.groupby(["Description"])["Quantity"].sum().sort_values()

Description
?                                     -26757
given away                            -20000
printing smudges/thrown away          -19200
missing                               -16467
Unsaleable, destroyed.                -15644
                                       ...  
BROCADE RING PURSE                     70700
JUMBO BAG RED RETROSPOT                78090
ASSORTED COLOUR BIRD ORNAMENT          81306
WHITE HANGING HEART T-LIGHT HOLDER     93050
WORLD WAR 2 GLIDERS ASSTD DESIGNS     108545
Name: Quantity, Length: 5698, dtype: int64

In [47]:
sales.sort_values(["Quantity"], ascending = False)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
1065882,581483,23843,"PAPER CRAFT , LITTLE BIRDIE",80995,2011-12-09 09:15:00,2.08,16446.0,United Kingdom
587080,541431,23166,MEDIUM CERAMIC TOP STORAGE JAR,74215,2011-01-18 10:01:00,1.04,12346.0,United Kingdom
90857,497946,37410,BLACK AND WHITE PAISLEY FLOWER MUG,19152,2010-02-15 11:57:00,0.10,13902.0,Denmark
127168,501534,21091,SET/6 WOODLAND PAPER PLATES,12960,2010-03-17 13:09:00,0.10,13902.0,Denmark
127166,501534,21099,SET/6 STRAWBERRY PAPER CUPS,12960,2010-03-17 13:09:00,0.10,13902.0,Denmark
...,...,...,...,...,...,...,...,...
303996,519017,22759,,-9600,2010-08-13 09:14:00,0.00,,United Kingdom
750991,556691,23005,printing smudges/thrown away,-9600,2011-06-14 10:37:00,0.00,,United Kingdom
750990,556690,23005,printing smudges/thrown away,-9600,2011-06-14 10:37:00,0.00,,United Kingdom
587085,C541433,23166,MEDIUM CERAMIC TOP STORAGE JAR,-74215,2011-01-18 10:17:00,1.04,12346.0,United Kingdom


In [48]:
date = pd.to_datetime(sales["InvoiceDate"]).dt.date

In [49]:
pd.to_datetime(sales["InvoiceDate"]).dt.date

0          2009-12-01
1          2009-12-01
2          2009-12-01
3          2009-12-01
4          2009-12-01
              ...    
1067366    2011-12-09
1067367    2011-12-09
1067368    2011-12-09
1067369    2011-12-09
1067370    2011-12-09
Name: InvoiceDate, Length: 1067371, dtype: object

In [50]:
sales["Quantity"].dtype

dtype('int64')

In [51]:
new_sale = sales
new_sale

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom
...,...,...,...,...,...,...,...,...
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France
1067369,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France


In [52]:
new_sale["InvoiceDate"] = pd.to_datetime(sales["InvoiceDate"]).dt.date

In [53]:
new_sale

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01,2.10,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01,1.25,13085.0,United Kingdom
...,...,...,...,...,...,...,...,...
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09,2.10,12680.0,France
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09,4.15,12680.0,France
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09,4.15,12680.0,France
1067369,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09,4.95,12680.0,France


In [54]:
# new_sale.loc[new_sale["Description"] == "JUMBO BAG OWLS	"]
# mask = new_sale["InvoiceDate"] == "2011-12-09"
# mask.any(axis = 0)
new_sale["InvoiceDate"].apply(lambda x: x == "2009-12-01")

0          False
1          False
2          False
3          False
4          False
           ...  
1067366    False
1067367    False
1067368    False
1067369    False
1067370    False
Name: InvoiceDate, Length: 1067371, dtype: bool

In [55]:
new_sale.tail(100)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
1067271,581579,20713,JUMBO BAG OWLS,10,2011-12-09,1.79,17581.0,United Kingdom
1067272,581579,21931,JUMBO STORAGE BAG SUKI,10,2011-12-09,1.79,17581.0,United Kingdom
1067273,581579,23199,JUMBO BAG APPLES,30,2011-12-09,1.79,17581.0,United Kingdom
1067274,581579,23353,6 GIFT TAGS VINTAGE CHRISTMAS,12,2011-12-09,0.83,17581.0,United Kingdom
1067275,581579,23350,ROLL WRAP VINTAGE SPOT,25,2011-12-09,1.25,17581.0,United Kingdom
...,...,...,...,...,...,...,...,...
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09,2.10,12680.0,France
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09,4.15,12680.0,France
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09,4.15,12680.0,France
1067369,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09,4.95,12680.0,France


In [56]:
sales

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01,2.10,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01,1.25,13085.0,United Kingdom
...,...,...,...,...,...,...,...,...
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09,2.10,12680.0,France
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09,4.15,12680.0,France
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09,4.15,12680.0,France
1067369,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09,4.95,12680.0,France


In [61]:
# sales['InvoiceDate']=pd.to_datetime(sales['InvoiceDate'])
# sales['Date']=pd.to_datetime(sales['InvoiceDate'].dt.date)

ans=sales[sales['Date']=='2011-06-22']
ans['Customer ID'].nunique()

# sales

57

In [62]:
sales

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,Date
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01,6.95,13085.0,United Kingdom,2009-12-01
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01,6.75,13085.0,United Kingdom,2009-12-01
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01,6.75,13085.0,United Kingdom,2009-12-01
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01,2.10,13085.0,United Kingdom,2009-12-01
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01,1.25,13085.0,United Kingdom,2009-12-01
...,...,...,...,...,...,...,...,...,...
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09,2.10,12680.0,France,2011-12-09
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09,4.15,12680.0,France,2011-12-09
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09,4.15,12680.0,France,2011-12-09
1067369,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09,4.95,12680.0,France,2011-12-09


In [79]:
# sales["total_price"] = sales["Quantity"] * sales["Price"]
# sales

sales_country = sales.groupby(["Country", "Date"])["total_price"].sum().sort_values(ascending = False)

In [80]:
sales_country = sales_country.reset_index()
sales_country[sales_country["Country"] == "France"]

Unnamed: 0,Country,Date,total_price
563,France,2010-08-09,12726.01
608,France,2011-10-28,8990.46
618,France,2010-05-06,8490.90
667,France,2011-03-31,5593.59
687,France,2010-11-19,5084.89
...,...,...,...
3184,France,2009-12-10,-118.90
3190,France,2009-12-03,-143.90
3208,France,2010-05-28,-364.80
3211,France,2010-08-05,-421.40
