## Generating a Sample CSV File

In [1]:
import pandas as pd
from datetime import datetime, timedelta
import random

# Generate random data
# A dedicated, seeded generator keeps the sample values identical on every
# Restart & Run All, without touching the global `random` state.
rng = random.Random(42)

# Capture "today" once so all ten dates derive from the same day, even if the
# cell happens to run across midnight.
today = datetime.now().date()
dates = [today - timedelta(days=i) for i in range(10)]
product_ids = [101, 102, 103, 104, 105]
data = []

# One row per (date, product) pair: 10 days x 5 products = 50 rows
for date in dates:
    for product_id in product_ids:
        quantity_sold = rng.randint(1, 50)
        unit_price = round(rng.uniform(10, 100), 2)
        # Rounded to 2 decimals so the stored total looks like a currency amount
        total_sales = round(quantity_sold * unit_price, 2)
        data.append([date, product_id, quantity_sold, unit_price, total_sales])

# Create DataFrame and save to CSV
df = pd.DataFrame(data, columns=['date', 'product_id', 'quantity_sold', 'unit_price', 'total_sales'])
# index=False keeps the RangeIndex out of the file so it round-trips cleanly
df.to_csv('sample_sales_data.csv', index=False)

print("Sample CSV file 'sample_sales_data.csv' generated successfully.")


Sample CSV file 'sample_sales_data.csv' generated successfully.


## Reading CSV Files

In [2]:
# Import pandas and load CSV file
import pandas as pd

# Load the generated sales data back from disk.
# Note: with the default arguments the `date` column comes back as plain
# strings (object dtype), not datetimes.
df = pd.read_csv('sample_sales_data.csv')

# Preview the first five rows
first_rows = df.head()
print(first_rows)


         date  product_id  quantity_sold  unit_price  total_sales
0  2024-11-07         101             31       15.63       484.53
1  2024-11-07         102             39       84.24      3285.36
2  2024-11-07         103             12       34.36       412.32
3  2024-11-07         104             16       84.06      1344.96
4  2024-11-07         105             50       86.64      4332.00


## Exploring and Summarizing Data

In [3]:
# Get info about the dataframe
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would emit a stray trailing "None".
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           50 non-null     object 
 1   product_id     50 non-null     int64  
 2   quantity_sold  50 non-null     int64  
 3   unit_price     50 non-null     float64
 4   total_sales    50 non-null     float64
dtypes: float64(2), int64(2), object(1)
memory usage: 2.1+ KB
None


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
numeric_summary = df.describe()
print(numeric_summary)

       product_id  quantity_sold  unit_price  total_sales
count   50.000000      50.000000   50.000000    50.000000
mean   103.000000      27.180000   58.599400  1559.238800
std      1.428571      15.364536   25.783317  1214.010033
min    101.000000       2.000000   12.220000   107.740000
25%    102.000000      12.000000   34.322500   683.360000
50%    103.000000      29.500000   60.145000  1179.510000
75%    104.000000      39.750000   83.177500  2209.410000
max    105.000000      50.000000   99.830000  4382.000000


## Handling Missing Values

In [5]:
# Count missing entries per column: build the boolean NA mask first,
# then sum it column-wise (True counts as 1)
na_mask = df.isna()
missing_values = na_mask.sum()
print(missing_values)

date             0
product_id       0
quantity_sold    0
unit_price       0
total_sales      0
dtype: int64


## Basic Data Manipulation

In [6]:
# Keep only the rows whose quantity_sold is strictly greater than 20
sold_more_than_20 = df['quantity_sold'].gt(20)
high_sales = df[sold_more_than_20]
print(high_sales)

          date  product_id  quantity_sold  unit_price  total_sales
0   2024-11-07         101             31       15.63       484.53
1   2024-11-07         102             39       84.24      3285.36
4   2024-11-07         105             50       86.64      4332.00
6   2024-11-06         102             36       26.02       936.72
7   2024-11-06         103             46       62.21      2861.66
8   2024-11-06         104             36       32.02      1152.72
9   2024-11-06         105             37       22.46       831.02
11  2024-11-05         102             32       50.31      1609.92
15  2024-11-04         101             43       90.23      3879.89
16  2024-11-04         102             49       28.86      1414.14
17  2024-11-04         103             48       45.57      2187.36
20  2024-11-03         101             33       52.68      1738.44
21  2024-11-03         102             33       59.96      1978.68
23  2024-11-03         104             40       78.17      3126.80

In [7]:
# Project the frame down to the three columns of interest
columns_of_interest = ['date', 'product_id', 'total_sales']
selected_columns = df[columns_of_interest]
print(selected_columns)


          date  product_id  total_sales
0   2024-11-07         101       484.53
1   2024-11-07         102      3285.36
2   2024-11-07         103       412.32
3   2024-11-07         104      1344.96
4   2024-11-07         105      4332.00
5   2024-11-06         101       816.10
6   2024-11-06         102       936.72
7   2024-11-06         103      2861.66
8   2024-11-06         104      1152.72
9   2024-11-06         105       831.02
10  2024-11-05         101       121.92
11  2024-11-05         102      1609.92
12  2024-11-05         103       798.64
13  2024-11-05         104       843.26
14  2024-11-05         105       357.55
15  2024-11-04         101      3879.89
16  2024-11-04         102      1414.14
17  2024-11-04         103      2187.36
18  2024-11-04         104       619.40
19  2024-11-04         105      1192.48
20  2024-11-03         101      1738.44
21  2024-11-03         102      1978.68
22  2024-11-03         103       583.60
23  2024-11-03         104      3126.80


In [8]:
# Derive a discounted_sales column: each total scaled by 0.9 (i.e. 10% off)
discount_multiplier = 0.9
df['discounted_sales'] = df['total_sales'].mul(discount_multiplier)

# Show the first rows, now including the new column
print(df.head())


         date  product_id  quantity_sold  unit_price  total_sales  \
0  2024-11-07         101             31       15.63       484.53   
1  2024-11-07         102             39       84.24      3285.36   
2  2024-11-07         103             12       34.36       412.32   
3  2024-11-07         104             16       84.06      1344.96   
4  2024-11-07         105             50       86.64      4332.00   

   discounted_sales  
0           436.077  
1          2956.824  
2           371.088  
3          1210.464  
4          3898.800  


## Grouping and Aggregating Data

In [9]:
# Sum total_sales within each product_id, then turn the group keys back into
# an ordinary column
per_product = df.groupby('product_id')
product_totals = per_product['total_sales'].sum()
total_sales_by_product = product_totals.reset_index()
print(total_sales_by_product)


   product_id  total_sales
0         101     14643.82
1         102     18105.92
2         103     18769.36
3         104     15045.25
4         105     11397.59


In [10]:
# Per-product total and mean of total_sales, using named aggregation:
# output column name -> (source column, aggregation function)
aggregations = {
    'total_sales': ('total_sales', 'sum'),
    'average_sales': ('total_sales', 'mean'),
}
sales_summary = df.groupby('product_id').agg(**aggregations).reset_index()
print(sales_summary)


   product_id  total_sales  average_sales
0         101     14643.82       1464.382
1         102     18105.92       1810.592
2         103     18769.36       1876.936
3         104     15045.25       1504.525
4         105     11397.59       1139.759
