In [1]:
import pandas as pd

# Exploratory overview of the raw dataset: row preview, schema, summary
# statistics, missing-value counts, duplicate-row count and column labels.

# 1. Read the dataset (for example, a CSV file)
data = pd.read_csv("1.csv")

# 2. Display the first few rows
print("First 5 rows:")
print(data.head())

# 3. Show basic information about the dataset
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called on its own; wrapping it in print() would append a stray
# "None" line after the report.
print("\nDataset Info:")
data.info()

# 4. Display basic statistics
print("\nSummary Statistics:")
print(data.describe())

# 5. Check for missing values (count of nulls per column)
print("\nMissing Values:")
print(data.isnull().sum())

# 6. Check for duplicate records (number of rows flagged as duplicates)
print("\nDuplicate Records:")
print(data.duplicated().sum())

# 7. Display column names
print("\nColumns in the dataset:")
print(data.columns)


First 5 rows:
               State  Year  Carbon_Emissions_MtCO2  GDP_BillionINR  \
0     Andhra Pradesh  1980                   55.71         4740.15   
1  Arunachal Pradesh  1980                  191.56        32180.58   
2              Assam  1980                   20.66        24293.30   
3              Bihar  1980                  168.72        24800.70   
4       Chhattisgarh  1980                  238.63         6955.01   

   Urbanization_Percent  Energy_Use_TWh  
0                 31.68           29.78  
1                 52.51          178.61  
2                 42.56           15.13  
3                 42.46           84.78  
4                 38.83          209.29  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1260 entries, 0 to 1259
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State                   1260 non-null   object 
 1   Year                    1260

In [3]:
import pandas as pd

# Load the raw records from disk.
data = pd.read_csv("1.csv")

# Report the per-column count of missing values before any cleaning.
print("Missing values before cleaning:", data.isnull().sum(), sep="\n")

# Fill gaps in the numeric columns with each column's mean, then discard
# duplicated rows. The fill values come from mean(numeric_only=True), so
# only numeric columns receive a replacement value.
# (Alternative strategy: drop incomplete rows with dropna() instead of filling.)
column_means = data.mean(numeric_only=True)
data = data.fillna(column_means).drop_duplicates()

# Confirm the outcome of both cleaning steps.
print("\nMissing values after cleaning:", data.isnull().sum(), sep="\n")
print("\nNumber of duplicate records after cleaning:", data.duplicated().sum(), sep="\n")

# Persist the cleaned frame, leaving the index out of the file.
data.to_csv("cleaned_dataset.csv", index=False)
print("\nCleaned dataset saved as 'cleaned_dataset.csv'")



Missing values before cleaning:
State                     0
Year                      0
Carbon_Emissions_MtCO2    0
GDP_BillionINR            0
Urbanization_Percent      0
Energy_Use_TWh            0
dtype: int64

Missing values after cleaning:
State                     0
Year                      0
Carbon_Emissions_MtCO2    0
GDP_BillionINR            0
Urbanization_Percent      0
Energy_Use_TWh            0
dtype: int64

Number of duplicate records after cleaning:
0

Cleaned dataset saved as 'cleaned_dataset.csv'


In [5]:
import pandas as pd

# Load a fresh copy of the raw dataset for this cell.
data = pd.read_csv("1.csv")

# --- Example transformations ------------------------------------------------
# Every step is guarded by a column-existence check evaluated at that point,
# so a step is skipped whenever its column is absent from the frame.
# NOTE(review): 'score1', 'score2', 'date' and 'category_column' are template
# names; they do not appear in the column listing printed elsewhere in this
# notebook, so these branches look inactive for this file -- confirm.

# a) Derive total_score as the sum of the two score columns.
if all(name in data.columns for name in ('score1', 'score2')):
    data['total_score'] = data['score1'] + data['score2']

# b) Parse the 'date' column into datetime values.
if 'date' in data.columns:
    data['date'] = pd.to_datetime(data['date'])

# c) Apply the column rename mapping.
column_renames = {'old_column_name': 'new_column_name'}
data = data.rename(columns=column_renames)

# d) Keep only the rows whose total_score exceeds 50.
if 'total_score' in data.columns:
    high_score_mask = data['total_score'] > 50
    filtered_data = data[high_score_mask]

# --- Basic statistical analysis ---------------------------------------------
print("Summary Statistics:", data.describe(), sep="\n")

# Pairwise correlation, restricted to the numeric columns.
numeric_columns = data.select_dtypes(include=['number'])
print("\nCorrelation Matrix:", numeric_columns.corr(), sep="\n")

if 'category_column' in data.columns:
    # Frequency of each distinct value in the categorical column.
    print("\nValue Counts in 'category_column':", data['category_column'].value_counts(), sep="\n")

    # Optional aggregate: mean total_score within each category.
    if 'total_score' in data.columns:
        grouped = data.groupby('category_column')['total_score'].mean()
        print("\nAverage total_score by category:", grouped, sep="\n")

Summary Statistics:
             Year  Carbon_Emissions_MtCO2  GDP_BillionINR  \
count  1260.00000             1260.000000     1260.000000   
mean   2002.00000              157.222508    24319.898389   
std      12.99233               70.856318    10557.311870   
min    1980.00000               15.070000     3206.290000   
25%    1991.00000               98.267500    15944.387500   
50%    2002.00000              156.295000    24398.290000   
75%    2013.00000              217.332500    32409.655000   
max    2024.00000              328.600000    54168.320000   

       Urbanization_Percent  Energy_Use_TWh  
count           1260.000000     1260.000000  
mean              41.424087      134.045802  
std               13.360824       70.879545  
min               16.020000       10.640000  
25%               29.522500       78.667500  
50%               41.785000      124.335000  
75%               52.802500      183.715000  
max               71.450000      372.120000  

Correlation Mat