In [1]:
import pandas as pd

# Load the datasets
test_df = pd.read_csv('test.csv')
train_df = pd.read_csv('train.csv')

# Count occurrences of each category/subcategory combination
train_counts = train_df.groupby(['category', 'sub_category']).size().reset_index(name='count')
test_counts = test_df.groupby(['category', 'sub_category']).size().reset_index(name='count')

# Compare the counts
train_counts = train_counts.set_index(['category', 'sub_category'])
test_counts = test_counts.set_index(['category', 'sub_category'])

# Create comparison dataframe
comparison = pd.concat([train_counts, test_counts], 
                      axis=1, 
                      keys=['train', 'test'], 
                      join='outer')
comparison.columns = ['train_count', 'test_count']

# Identify missing/extra categories
comparison['train_extra'] = comparison['train_count'].isnull() | (comparison['train_count'] == 0)
comparison['test_extra'] = comparison['test_count'].isnull() | (comparison['test_count'] == 0)

# Reset index for better readability
comparison = comparison.reset_index()

# Display results
print(comparison)

                                          category  \
0                            Any Other Cyber Crime   
1                             Cryptocurrency Crime   
2                   Cyber Attack/ Dependent Crimes   
3                   Cyber Attack/ Dependent Crimes   
4                   Cyber Attack/ Dependent Crimes   
5                   Cyber Attack/ Dependent Crimes   
6                   Cyber Attack/ Dependent Crimes   
7                   Cyber Attack/ Dependent Crimes   
8                   Cyber Attack/ Dependent Crimes   
9                                  Cyber Terrorism   
10  Hacking  Damage to computercomputer system etc   
11  Hacking  Damage to computercomputer system etc   
12  Hacking  Damage to computercomputer system etc   
13  Hacking  Damage to computercomputer system etc   
14  Hacking  Damage to computercomputer system etc   
15                        Online Cyber Trafficking   
16                          Online Financial Fraud   
17                          

In [2]:
import pandas as pd
import numpy as np
from collections import Counter

# Load the datasets with proper error handling
def load_data(file_path, dataset_name):
    try:
        df = pd.read_csv(file_path)
        print(f"\n{dataset_name} Dataset loaded successfully")
        print(f"Shape: {df.shape}")
        return df
    except Exception as e:
        print(f"Error loading {dataset_name} dataset: {e}")
        return None

# Function to analyze categories
def analyze_categories(df, dataset_name):
    print(f"\n{dataset_name} Dataset Analysis:")
    print("-" * 50)
    
    # Count unique categories
    unique_cats = df['category'].nunique()
    unique_subcats = df['sub_category'].nunique()
    
    print(f"Unique Categories: {unique_cats}")
    print(f"Unique Sub-categories: {unique_subcats}")
    
    # Get value counts
    cat_counts = df['category'].value_counts()
    subcat_counts = df['sub_category'].value_counts()
    
    return cat_counts, subcat_counts

# Function to compare datasets
def compare_datasets(train_df, test_df):
    print("\nDataset Comparison:")
    print("-" * 50)
    
    # Create comparison dataframes for categories
    train_cats = set(train_df['category'].unique())
    test_cats = set(test_df['category'].unique())
    
    # Find unique categories in each dataset
    unique_to_train = train_cats - test_cats
    unique_to_test = test_cats - train_cats
    common_cats = train_cats.intersection(test_cats)
    
    # Create detailed comparison for subcategories
    train_pairs = set(zip(train_df['category'], train_df['sub_category']))
    test_pairs = set(zip(test_df['category'], test_df['sub_category']))
    
    # Compare category-subcategory pairs
    unique_to_train_pairs = train_pairs - test_pairs
    unique_to_test_pairs = test_pairs - train_pairs
    
    return {
        'unique_to_train': unique_to_train,
        'unique_to_test': unique_to_test,
        'common_cats': common_cats,
        'unique_train_pairs': unique_to_train_pairs,
        'unique_test_pairs': unique_to_test_pairs
    }

# Function to create detailed analysis report
def create_analysis_report(train_df, test_df, comparison_results):
    # Create a DataFrame for side-by-side comparison
    comparison_df = pd.DataFrame({
        'Train_Categories': pd.Series(train_df['category'].value_counts()),
        'Test_Categories': pd.Series(test_df['category'].value_counts())
    }).fillna(0)
    
    # Calculate percentages
    comparison_df['Train_Percentage'] = (comparison_df['Train_Categories'] / 
                                       len(train_df) * 100).round(2)
    comparison_df['Test_Percentage'] = (comparison_df['Test_Categories'] / 
                                      len(test_df) * 100).round(2)
    
    return comparison_df

# Main execution
def main():
    # Load datasets
    train_df = load_data('train.csv', 'Training')
    test_df = load_data('test.csv', 'Testing')
    
    if train_df is None or test_df is None:
        return
    
    # Analyze individual datasets
    train_cats, train_subcats = analyze_categories(train_df, 'Training')
    test_cats, test_subcats = analyze_categories(test_df, 'Testing')
    
    # Compare datasets
    comparison_results = compare_datasets(train_df, test_df)
    
    # Create detailed analysis report
    analysis_df = create_analysis_report(train_df, test_df, comparison_results)
    
    # Print detailed results
    print("\nDetailed Category Analysis:")
    print(analysis_df)
    
    print("\nUnique Category-Subcategory Combinations:")
    print("\nUnique to Training Dataset:")
    for cat, subcat in comparison_results['unique_train_pairs']:
        print(f"Category: {cat} | Subcategory: {subcat}")
    
    print("\nUnique to Testing Dataset:")
    for cat, subcat in comparison_results['unique_test_pairs']:
        print(f"Category: {cat} | Subcategory: {subcat}")
    
    # Save results to CSV
    analysis_df.to_csv('category_analysis_report.csv')

if __name__ == "__main__":
    main()


Training Dataset loaded successfully
Shape: (93686, 3)

Testing Dataset loaded successfully
Shape: (31229, 3)

Training Dataset Analysis:
--------------------------------------------------
Unique Categories: 15
Unique Sub-categories: 35

Testing Dataset Analysis:
--------------------------------------------------
Unique Categories: 15
Unique Sub-categories: 37

Dataset Comparison:
--------------------------------------------------

Detailed Category Analysis:
                                                    Train_Categories  \
category                                                               
Any Other Cyber Crime                                        10878.0   
Child Pornography CPChild Sexual Abuse Material...             379.0   
Crime Against Women & Children                                   0.0   
Cryptocurrency Crime                                           480.0   
Cyber Attack/ Dependent Crimes                                3608.0   
Cyber Terrorism                

In [7]:
import pandas as pd

def analyze_categories(train_path, test_path):
    # Load datasets
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    
    print("Train Dataset Category-Subcategory Combinations:")
    print("-" * 50)
    
    # Get unique combinations from train
    train_combinations = train_df.groupby(['category', 'sub_category']).size().reset_index()
    for _, row in train_combinations.iterrows():
        if pd.isna(row['sub_category']):
            print(f"Category: {row['category']} | Subcategory: None")
        else:
            print(f"Category: {row['category']} | Subcategory: {row['sub_category']}")
    
    print("\nTest Dataset Category-Subcategory Combinations:")
    print("-" * 50)
    
    # Get unique combinations from test
    test_combinations = test_df.groupby(['category', 'sub_category']).size().reset_index()
    for _, row in test_combinations.iterrows():
        if pd.isna(row['sub_category']):
            print(f"Category: {row['category']} | Subcategory: None")
        else:
            print(f"Category: {row['category']} | Subcategory: {row['sub_category']}")
    
    # Save to file
    with open('category_subcategory_combinations.txt', 'w') as f:
        f.write("Train Dataset Combinations:\n")
        f.write("-" * 50 + "\n")
        for _, row in train_combinations.iterrows():
            if pd.isna(row['sub_category']):
                f.write(f"Category: {row['category']} | Subcategory: None\n")
            else:
                f.write(f"Category: {row['category']} | Subcategory: {row['sub_category']}\n")
        
        f.write("\nTest Dataset Combinations:\n")
        f.write("-" * 50 + "\n")
        for _, row in test_combinations.iterrows():
            if pd.isna(row['sub_category']):
                f.write(f"Category: {row['category']} | Subcategory: None\n")
            else:
                f.write(f"Category: {row['category']} | Subcategory: {row['sub_category']}\n")

if __name__ == "__main__":
    train_path = 'train.csv'
    test_path = 'test.csv'
    analyze_categories(train_path, test_path)

Train Dataset Category-Subcategory Combinations:
--------------------------------------------------
Category: Any Other Cyber Crime | Subcategory: Other
Category: Cryptocurrency Crime | Subcategory: Cryptocurrency Fraud
Category: Cyber Attack/ Dependent Crimes | Subcategory: Data Breach/Theft
Category: Cyber Attack/ Dependent Crimes | Subcategory: Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks
Category: Cyber Attack/ Dependent Crimes | Subcategory: Hacking/Defacement
Category: Cyber Attack/ Dependent Crimes | Subcategory: Malware Attack
Category: Cyber Attack/ Dependent Crimes | Subcategory: Ransomware Attack
Category: Cyber Attack/ Dependent Crimes | Subcategory: SQL Injection
Category: Cyber Attack/ Dependent Crimes | Subcategory: Tampering with computer source documents
Category: Cyber Terrorism | Subcategory: Cyber Terrorism
Category: Hacking  Damage to computercomputer system etc | Subcategory: Damage to computer computer systems etc
Category: Hacking  Damage

In [1]:
import pandas as pd

def analyze_categories(train_path, test_path):
    # Load datasets
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    
    print("Train Dataset Category-Subcategory Combinations:")
    print("-" * 50)
    
    # Get unique combinations from train
    train_combinations = train_df.groupby(['category', 'sub_category']).size().reset_index()
    for _, row in train_combinations.iterrows():
        if pd.isna(row['sub_category']):
            print(f"Category: {row['category']} | Subcategory: None")
        else:
            print(f"Category: {row['category']} | Subcategory: {row['sub_category']}")
    
    print("\nTest Dataset Category-Subcategory Combinations:")
    print("-" * 50)
    
    # Get unique combinations from test
    test_combinations = test_df.groupby(['category', 'sub_category']).size().reset_index()
    for _, row in test_combinations.iterrows():
        if pd.isna(row['sub_category']):
            print(f"Category: {row['category']} | Subcategory: None")
        else:
            print(f"Category: {row['category']} | Subcategory: {row['sub_category']}")
    
    # Save to file
    with open('category_subcategory_combinations.txt', 'w') as f:
        f.write("Train Dataset Combinations:\n")
        f.write("-" * 50 + "\n")
        for _, row in train_combinations.iterrows():
            if pd.isna(row['sub_category']):
                f.write(f"Category: {row['category']} | Subcategory: None\n")
            else:
                f.write(f"Category: {row['category']} | Subcategory: {row['sub_category']}\n")
        
        f.write("\nTest Dataset Combinations:\n")
        f.write("-" * 50 + "\n")
        for _, row in test_combinations.iterrows():
            if pd.isna(row['sub_category']):
                f.write(f"Category: {row['category']} | Subcategory: None\n")
            else:
                f.write(f"Category: {row['category']} | Subcategory: {row['sub_category']}\n")

if __name__ == "__main__":
    train_path = 'train.csv'
    test_path = 'test.csv'
    analyze_categories(train_path, test_path)

Train Dataset Category-Subcategory Combinations:
--------------------------------------------------
Category: Any Other Cyber Crime | Subcategory: Other
Category: Cryptocurrency Crime | Subcategory: Cryptocurrency Fraud
Category: Cyber Attack/ Dependent Crimes | Subcategory: Data Breach/Theft
Category: Cyber Attack/ Dependent Crimes | Subcategory: Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks
Category: Cyber Attack/ Dependent Crimes | Subcategory: Hacking/Defacement
Category: Cyber Attack/ Dependent Crimes | Subcategory: Malware Attack
Category: Cyber Attack/ Dependent Crimes | Subcategory: Ransomware Attack
Category: Cyber Attack/ Dependent Crimes | Subcategory: SQL Injection
Category: Cyber Attack/ Dependent Crimes | Subcategory: Tampering with computer source documents
Category: Cyber Terrorism | Subcategory: Cyber Terrorism
Category: Hacking  Damage to computercomputer system etc | Subcategory: Damage to computer computer systems etc
Category: Hacking  Damage

In [2]:
import pandas as pd

def combine_category_subcategories(train_path, test_path):
    # Load both datasets
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    
    # Combine datasets
    combined_df = pd.concat([train_df, test_df], ignore_index=True)
    
    # Get unique category-subcategory pairs with counts
    category_pairs = combined_df.groupby(['category', 'sub_category']).size().reset_index(name='count')
    
    # Sort by category and count
    category_pairs = category_pairs.sort_values(['category', 'count'], ascending=[True, False])
    
    # Replace NaN subcategories with 'None'
    category_pairs['sub_category'] = category_pairs['sub_category'].fillna('None')
    
    return category_pairs

# File paths
train_path = 'train.csv'
test_path = 'test.csv'

# Get combined list
complete_list = combine_category_subcategories(train_path, test_path)

# Print formatted output
print("\nComplete Category-Subcategory Pairs:")
print("-" * 50)
current_category = None

for _, row in complete_list.iterrows():
    if current_category != row['category']:
        current_category = row['category']
        print(f"\n{current_category}:")
    print(f"- {row['sub_category']} (Count: {row['count']})")


Complete Category-Subcategory Pairs:
--------------------------------------------------

Any Other Cyber Crime:
- Other (Count: 14548)

Crime Against Women & Children:
- Computer Generated CSAM/CSEM (Count: 2)
- Cyber Blackmailing & Threatening (Count: 1)
- Sexual Harassment (Count: 1)

Cryptocurrency Crime:
- Cryptocurrency Fraud (Count: 646)

Cyber Attack/ Dependent Crimes:
- Hacking/Defacement (Count: 740)
- Ransomware Attack (Count: 720)
- Tampering with computer source documents (Count: 697)
- Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks (Count: 691)
- Malware Attack (Count: 691)
- SQL Injection (Count: 675)
- Data Breach/Theft (Count: 655)

Cyber Terrorism:
- Cyber Terrorism (Count: 213)

Hacking  Damage to computercomputer system etc:
- Unauthorised AccessData Breach (Count: 1484)
- Email Hacking (Count: 479)
- Damage to computer computer systems etc (Count: 147)
- Website DefacementHacking (Count: 128)
- Tampering with computer source documents (Count: 

Category: Any Other Cyber Crime | Subcategory: Other
Category: Crime Against Women & Children | Subcategory: Computer Generated CSAM/CSEM
Category: Crime Against Women & Children | Subcategory: Cyber Blackmailing & Threatening
Category: Crime Against Women & Children | Subcategory: Sexual Harassment
Category: Cryptocurrency Crime | Subcategory: Cryptocurrency Fraud
Category: Cyber Attack/ Dependent Crimes | Subcategory: Data Breach/Theft
Category: Cyber Attack/ Dependent Crimes | Subcategory: Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks
Category: Cyber Attack/ Dependent Crimes | Subcategory: Hacking/Defacement
Category: Cyber Attack/ Dependent Crimes | Subcategory: Malware Attack
Category: Cyber Attack/ Dependent Crimes | Subcategory: Ransomware Attack
Category: Cyber Attack/ Dependent Crimes | Subcategory: SQL Injection
Category: Cyber Attack/ Dependent Crimes | Subcategory: Tampering with computer source documents
Category: Cyber Terrorism | Subcategory: Cyber Terrorism
Category: Hacking  Damage to computercomputer system etc | Subcategory: Damage to computer computer systems etc
Category: Hacking  Damage to computercomputer system etc | Subcategory: Email Hacking
Category: Hacking  Damage to computercomputer system etc | Subcategory: Tampering with computer source documents
Category: Hacking  Damage to computercomputer system etc | Subcategory: Unauthorised AccessData Breach
Category: Hacking  Damage to computercomputer system etc | Subcategory: Website DefacementHacking
Category: Online Cyber Trafficking | Subcategory: Online Trafficking
Category: Online Financial Fraud | Subcategory: Business Email CompromiseEmail Takeover
Category: Online Financial Fraud | Subcategory: DebitCredit Card FraudSim Swap Fraud
Category: Online Financial Fraud | Subcategory: DematDepository Fraud
Category: Online Financial Fraud | Subcategory: EWallet Related Fraud
Category: Online Financial Fraud | Subcategory: Fraud CallVishing
Category: Online Financial Fraud | Subcategory: Internet Banking Related Fraud
Category: Online Financial Fraud | Subcategory: UPI Related Frauds
Category: Online Gambling  Betting | Subcategory: Online Gambling  Betting
Category: Online and Social Media Related Crime | Subcategory: Cheating by Impersonation
Category: Online and Social Media Related Crime | Subcategory: Cyber Bullying  Stalking  Sexting
Category: Online and Social Media Related Crime | Subcategory: EMail Phishing
Category: Online and Social Media Related Crime | Subcategory: FakeImpersonating Profile
Category: Online and Social Media Related Crime | Subcategory: Impersonating Email
Category: Online and Social Media Related Crime | Subcategory: Intimidating Email
Category: Online and Social Media Related Crime | Subcategory: Online Job Fraud
Category: Online and Social Media Related Crime | Subcategory: Online Matrimonial Fraud
Category: Online and Social Media Related Crime | Subcategory: Profile Hacking Identity Theft
Category: Online and Social Media Related Crime | Subcategory: Provocative Speech for unlawful acts
Category: Ransomware | Subcategory: Ransomware
Category: Report Unlawful Content | Subcategory: Against Interest of sovereignty or integrity of India