In [None]:
pip install pandas

## Read CSV file and print Dataframe

In [1]:
import pandas as pd
from scipy.stats import zscore

# Load the CSV file (path is relative to the notebook's working directory)
# into the DataFrame `df` that the following cells inspect.
df = pd.read_csv('test.csv')

In [2]:
# Display basic information and identify issues
print("Dataframe Info:")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
print("\nFirst few rows of the dataframe:")
print(df.head())

Dataframe Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206 entries, 0 to 205
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   User_ID                     103 non-null    float64
 1   Age                         103 non-null    object 
 2   Gender                      103 non-null    object 
 3   Platform                    103 non-null    object 
 4   Daily_Usage_Time (minutes)  103 non-null    float64
 5   Posts_Per_Day               103 non-null    float64
 6   Likes_Received_Per_Day      103 non-null    float64
 7   Comments_Received_Per_Day   103 non-null    float64
 8   Messages_Sent_Per_Day       103 non-null    float64
 9   Dominant_Emotion            103 non-null    object 
dtypes: float64(6), object(4)
memory usage: 16.2+ KB
None

First few rows of the dataframe:
   User_ID  Age      Gender  Platform  Daily_Usage_Time (minutes)  \
0      NaN  NaN         NaN       

## Data Cleaning Script for Social Media User Data 

This script cleans a CSV file containing social media user data. It performs the following tasks:

**Load Data:**
- Imports the pandas library and reads the CSV file into a DataFrame named `df`.

**Data Exploration:**
- Prints basic information about the DataFrame using `.info()`.
- Prints the first few rows using `.head()` to identify potential issues.

**Missing Values:**
- Prints the number of missing values per column using `.isnull().sum()`.
- Drops rows with any missing values using `.dropna()` and stores the cleaned data in `df_cleaned`.

**Data Type Conversion:**
- Goes through specific columns and attempts to convert their values to numeric data types using `pd.to_numeric()`.
- Uses the `errors='coerce'` argument to handle non-numeric values by converting them to NaN (Not a Number).
- Writes the converted values back into the same columns of `df_cleaned`.
- Drops rows with conversion errors using `.dropna()`.

**Duplicate Removal:**
- Removes duplicate rows from `df_cleaned` using `.drop_duplicates()`.

**Outlier Detection and Removal:**
- Imports the `zscore` function from `scipy.stats` for z-score calculation.
- Defines a function `remove_outliers` that calculates z-scores for a specified column.
- Filters the DataFrame to keep rows with z-scores within 3 standard deviations of the mean, effectively removing outliers.
- Returns the filtered DataFrame without any temporary z-score column.
- Applies the `remove_outliers` function to relevant columns containing numeric data (e.g., `Daily_Usage_Time (minutes)`, `Posts_Per_Day`).

**Saving Cleaned Data:**
- Saves the cleaned DataFrame `df_cleaned` to a new CSV file named `cleaned_file.csv` using `.to_csv()`.
- Sets `index=False` to avoid saving the row index in the output file.

**Confirmation:**
- Prints basic information about the cleaned DataFrame using `.info()` for verification.

In [None]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('./test.csv')

# Display basic information and identify issues
print("Dataframe Info:")
df.info()  # info() prints its own report and returns None, so it is not wrapped in print()
print("\nFirst few rows of the dataframe:")
print(df.head())

# Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# Drop rows with any missing values. The explicit copy makes df_cleaned an
# independent frame, so the column assignments below never target a slice of df.
df_cleaned = df.dropna().copy()

# Convert columns to numeric dtypes; values that cannot be parsed become NaN.
# Plain column assignment replaces the whole column (including its dtype).
# Assigning through .loc[:, column] writes into the existing column instead,
# which can leave a text column such as 'Age' with dtype object.
numeric_columns = [
    'Age',
    'Daily_Usage_Time (minutes)',
    'Posts_Per_Day',
    'Likes_Received_Per_Day',
    'Comments_Received_Per_Day',
    'Messages_Sent_Per_Day',
]
for column in numeric_columns:
    df_cleaned[column] = pd.to_numeric(df_cleaned[column], errors='coerce')

# Drop rows with conversion errors
df_cleaned = df_cleaned.dropna()

# Remove duplicate rows
df_cleaned = df_cleaned.drop_duplicates()

# Define a function to remove outliers based on z-score
from scipy.stats import zscore

def remove_outliers(df, column):
    """Return a copy of ``df`` without the rows that are outliers in ``column``.

    A row is kept when the absolute z-score of its ``column`` value is below 3.

    Args:
        df: DataFrame to filter. It is not modified.
        column: Name of a numeric column. NaN values make every z-score NaN,
            so missing values should be dropped before calling this.

    Returns:
        A new DataFrame containing only the rows that are not outliers.
    """
    # A column with a single distinct value has zero standard deviation: every
    # z-score would be NaN and the "< 3" test would throw away all rows.
    # Nothing can be an outlier in that case, so the data is returned as is.
    if df[column].nunique(dropna=False) <= 1:
        return df.copy()

    # Build the mask on the side rather than storing a temporary 'zscore'
    # column on the caller's DataFrame.
    keep = abs(zscore(df[column])) < 3
    return df[keep].copy()

# Apply the function to relevant columns, one after another, so each pass
# works on the rows that survived the previous one.
outlier_columns = [
    'Daily_Usage_Time (minutes)',
    'Posts_Per_Day',
    'Likes_Received_Per_Day',
    'Comments_Received_Per_Day',
    'Messages_Sent_Per_Day',
]
for column in outlier_columns:
    df_cleaned = remove_outliers(df_cleaned, column)

# Save the cleaned dataframe to a new CSV file
df_cleaned.to_csv('cleaned_file.csv', index=False)

print("\nDataframe after cleaning and saving:")
df_cleaned.info()  # info() prints its own report; print() around it would add "None"


In [None]:
!pip install mrjob pandas


In [None]:
pip install mrjob


# **Filtering Data:**

## Filter rows based on specific conditions using the filter function:

In [None]:
# The DataFrame being filtered and the column used in the condition must be the
# same frame, so both refer to data_df.
# NOTE(review): data_df is assumed to be the Spark DataFrame created in an
# earlier cell that is not part of this notebook as shown — confirm.
filtered_data = data_df.filter(data_df.Age > 25)  # Select rows where Age is greater than 25
filtered_data.show()

In [None]:
## select specified column and grouping and aggregation

In [None]:
# Project the filtered frame down to the columns of interest and show them
columns_of_interest = ("User_ID", "Daily_Usage_Time (minutes)", "Likes_Received_Per_Day")
selected_data = filtered_data.select(*columns_of_interest)
print("Selected Columns User_ID,Daily_Usage_Time (minutes), Likes_Received_Per_Day")
selected_data.show()

In [None]:
# Group By and Aggregation (replace columns as needed)
# The dict form of agg() names the aggregate as a string, so no function has to
# be imported (no visible cell imports `mean`). toDF() then renames the two
# result columns positionally: the grouping key and the average.
avg_usage_by_platform = (
    filtered_data.groupBy("Platform")
    .agg({"Daily_Usage_Time (minutes)": "avg"})
    .toDF("Platform", "Avg_Daily_Usage_Time")
)
print("Average Daily Usage Time per Platform:")
avg_usage_by_platform.show()

In [None]:
# Summary statistics for a handful of columns (swap in other columns as needed)
print("Descriptive Statistics (Daily_Usage_Time, Age, Posts_Per_Day):")
summary_stats = data_df.describe("Daily_Usage_Time (minutes)", "Age", "Posts_Per_Day")
summary_stats.show()

In [None]:
# Additional analysis and UDFs can be included here, showing output after each step

# Stop the SparkSession (local environment)
#spark.stop()

**My input file is `input.csv`. In WSL (Ubuntu 22.04) I run `dominant_emotion.py`, the MapReduce code shown below.**

I first upload the input file to HDFS with `hdfs dfs -put /tmp/input.csv /test_data/input.csv`, then run the job on Hadoop with `python3 run_job.py`. The output produced by Hadoop is stored in HDFS, and finally I download `output.csv` from HDFS.


## TASK 1: Find out emotion count based on gender using Mapreduce in Hadoop framework using HDFS

![Example Image](Dominantemotion.jpg)


![Example Image](Hadoop_imosioncount.jpg)


![Example Image](hadoop.jpg)


![Example Image](get.jpg)



In [None]:
from mrjob.job import MRJob
from mrjob.step import MRStep

class DominantEmotion(MRJob):
    """MapReduce job that counts dominant emotions per gender.

    Input: comma-separated lines with at least ten fields, where field 2 is the
    gender and field 9 is the dominant emotion (zero-based positions).
    Output: one record per gender whose value maps each emotion to its count.
    """

    # Zero-based positions of the fields this job reads.
    GENDER_FIELD = 2
    EMOTION_FIELD = 9

    def steps(self):
        """Declare the single map/reduce step of this job."""
        return [
            MRStep(mapper=self.mapper_get_emotions,
                   reducer=self.reducer_count_emotions)
        ]

    def mapper_get_emotions(self, _, line):
        """Emit ``(gender, emotion)`` for every usable data line.

        The header line, lines with too few fields, and lines whose gender or
        emotion is empty are skipped.
        """
        # Skip the header
        if line.startswith('User_ID'):
            return

        # Strip each field so stray spaces or a trailing '\r' do not create
        # separate keys for the same value.
        parts = [part.strip() for part in line.split(',')]

        # Ensure there are enough parts in the line to avoid index errors
        if len(parts) <= self.EMOTION_FIELD:
            return

        gender = parts[self.GENDER_FIELD]
        emotion = parts[self.EMOTION_FIELD]

        # Rows made only of separators (e.g. ',,,,,,,,,') have enough fields but
        # no data; counting them would add an empty gender/emotion to the result.
        if gender and emotion:
            yield gender, emotion

    def reducer_count_emotions(self, key, values):
        """Emit ``(gender, {emotion: count})`` for all emotions seen for ``key``."""
        emotion_counts = {}
        for emotion in values:
            emotion_counts[emotion] = emotion_counts.get(emotion, 0) + 1
        yield key, emotion_counts

if __name__ == '__main__':
    DominantEmotion.run()


## Drivercode

In [None]:
import csv
import subprocess
import tempfile
import os
from dominant_emotion import DominantEmotion

def _emotion_count_rows(lines):
    """Yield ``[gender, emotion, count]`` rows parsed from the job's output lines."""
    import json  # local import: only this helper needs it

    for line in lines:
        line = line.rstrip('\n')
        if not line.strip():
            continue  # tolerate blank lines in the merged output
        key, value = line.split('\t', 1)
        # mrjob writes key and value JSON-encoded by default, so both are
        # decoded with json.loads: the key loses its surrounding quotes and
        # the value is parsed without eval().
        gender = json.loads(key)
        for emotion, count in json.loads(value).items():
            yield [gender, emotion, count]


def run_mrjob(input_file, output_dir):
    """Run DominantEmotion on Hadoop and publish its result as a CSV in HDFS.

    Args:
        input_file: HDFS URI of the input CSV.
        output_dir: HDFS directory for the job output. It is removed first if
            it exists; the final ``output_emotion.csv`` is uploaded into it.

    Raises:
        subprocess.CalledProcessError: if ``hdfs dfs -getmerge`` or
            ``hdfs dfs -put`` fails.
    """
    # All local intermediates live in a private temporary directory that is
    # removed on exit, even when a step fails. Nothing in the current working
    # directory is created, overwritten or deleted.
    with tempfile.TemporaryDirectory() as work_dir:
        merged_output = os.path.join(work_dir, 'merged_output.tsv')
        local_csv = os.path.join(work_dir, 'final_output.csv')

        # Remove the existing output directory if it exists
        subprocess.run(['hdfs', 'dfs', '-rm', '-r', output_dir], check=False)

        # Run the MRJob with HDFS input and HDFS output
        mr_job = DominantEmotion(args=['-r', 'hadoop', input_file, '--output-dir', output_dir])

        with mr_job.make_runner() as runner:
            runner.run()

            # Merge the job's part files from HDFS into one local file
            subprocess.run(['hdfs', 'dfs', '-getmerge', f'{output_dir}/part-*', merged_output], check=True)

            # Convert the merged output into a flat CSV
            with open(merged_output, 'r') as infile, open(local_csv, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(['Gender', 'Emotion', 'Count'])
                writer.writerows(_emotion_count_rows(infile))

            # Put the final output CSV to HDFS
            final_output_path = f'{output_dir}/output_emotion.csv'
            subprocess.run(['hdfs', 'dfs', '-put', local_csv, final_output_path], check=True)

if __name__ == '__main__':
    input_file = 'hdfs:///test_data/input.csv'  # Your input HDFS CSV file
    output_dir = 'hdfs:///output_data/'  # Your output HDFS directory
    run_mrjob(input_file, output_dir)


## TASK 2 To find out count Based on Emotion using Mapreduce
Mapper: mapper_extract_emotion
The mapper function extracts the dominant emotion from each line of input and yields it along with a count of 1. This function is called once for each line of the input file.
Reducer: reducer_count_emotions
The reducer function aggregates the counts of each emotion emitted by the mapper.


![Example Image](emptioncount.jpg)


In [None]:
from mrjob.job import MRJob
from mrjob.step import MRStep

class EmotionCounts(MRJob):
    """MapReduce job that counts how often each dominant emotion occurs.

    Input: comma-separated lines with at least ten fields, where field 9
    (zero-based) is the dominant emotion.
    Output: one ``(emotion, count)`` record per emotion.
    """

    # Zero-based position of the dominant-emotion field.
    EMOTION_FIELD = 9

    def steps(self):
        """Declare the single map/reduce step of this job."""
        return [
            MRStep(mapper=self.mapper_extract_emotion,
                   reducer=self.reducer_count_emotions)
        ]

    def mapper_extract_emotion(self, _, line):
        """Emit ``(emotion, 1)`` for every usable data line.

        The header line, lines with too few fields, and lines with an empty
        emotion are skipped.
        """
        # Skip the header
        if line.startswith('User_ID'):
            return

        parts = line.split(',')

        # Ensure there are enough parts in the line to avoid index errors
        if len(parts) <= self.EMOTION_FIELD:
            return

        # Strip so stray spaces or a trailing '\r' do not split one emotion
        # into several keys.
        emotion = parts[self.EMOTION_FIELD].strip()

        # Rows made only of separators (e.g. ',,,,,,,,,') have enough fields but
        # no data; counting them would add an empty emotion to the result.
        if emotion:
            yield emotion, 1

    def reducer_count_emotions(self, emotion, counts):
        """Emit ``(emotion, total)`` by summing the 1s emitted by the mapper."""
        yield emotion, sum(counts)

if __name__ == '__main__':
    EmotionCounts.run()


# Driver code

In [None]:
import csv
import subprocess
import tempfile
import os
from emotion_counts import EmotionCounts

def _emotion_total_rows(lines):
    """Yield ``[emotion, count]`` rows parsed from the job's output lines."""
    import json  # local import: only this helper needs it

    for line in lines:
        line = line.rstrip('\n')
        if not line.strip():
            continue  # tolerate blank lines in the merged output
        key, value = line.split('\t', 1)
        # mrjob writes key and value JSON-encoded by default; decoding removes
        # the quotes that would otherwise end up around the emotion in the CSV.
        yield [json.loads(key), json.loads(value)]


def run_mrjob(input_file, output_dir):
    """Run EmotionCounts on Hadoop and publish its result as a CSV in HDFS.

    Args:
        input_file: HDFS URI of the input CSV.
        output_dir: HDFS directory for the job output. It is removed first if
            it exists; the final ``emotion_counts.csv`` is uploaded into it.

    Raises:
        subprocess.CalledProcessError: if ``hdfs dfs -getmerge`` or
            ``hdfs dfs -put`` fails.
    """
    # All local intermediates live in a private temporary directory that is
    # removed on exit, even when a step fails. Nothing in the current working
    # directory is created, overwritten or deleted.
    with tempfile.TemporaryDirectory() as work_dir:
        merged_output = os.path.join(work_dir, 'merged_output.tsv')
        local_csv = os.path.join(work_dir, 'final_output.csv')

        # Remove the existing output directory if it exists
        subprocess.run(['hdfs', 'dfs', '-rm', '-r', output_dir], check=False)

        # Run the MRJob with HDFS input and HDFS output
        mr_job = EmotionCounts(args=['-r', 'hadoop', input_file, '--output-dir', output_dir])

        with mr_job.make_runner() as runner:
            runner.run()

            # Merge the job's part files from HDFS into one local file
            subprocess.run(['hdfs', 'dfs', '-getmerge', f'{output_dir}/part-*', merged_output], check=True)

            # Convert the merged output into a flat CSV
            with open(merged_output, 'r') as infile, open(local_csv, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(['Emotion', 'Count'])
                writer.writerows(_emotion_total_rows(infile))

            # Put the final output CSV to HDFS
            final_output_path = f'{output_dir}/emotion_counts.csv'
            subprocess.run(['hdfs', 'dfs', '-put', local_csv, final_output_path], check=True)

if __name__ == '__main__':
    input_file = 'hdfs:///test_data/input.csv'  # Your input HDFS CSV file
    output_dir = 'hdfs:///output_data/'  # Your output HDFS directory
    run_mrjob(input_file, output_dir)


## TASK 3: User count by gender and platform

Mapper: Reads each line of the CSV, extracts the gender and the platform, and emits the pair ((gender, platform), 1).

Reducer: Sums the emitted 1s for each (gender, platform) pair, giving the number of users of each gender on each platform.

![Example Image](output_gender.jpg)


In [None]:
import csv
import subprocess
import tempfile
import os
from platform_by_gender import PlatformByGender
from mrjob.job import MRJob

def _gender_platform_rows(lines):
    """Yield ``[gender, platform, count]`` rows parsed from the job's output lines."""
    import json  # local import: only this helper needs it

    for line in lines:
        line = line.rstrip('\n')
        if not line.strip():
            continue  # tolerate blank lines in the merged output
        key, value = line.split('\t', 1)
        # mrjob writes key and value JSON-encoded by default, so the key is a
        # JSON list [gender, platform]; json.loads parses it without eval().
        gender, platform = json.loads(key)
        count = int(json.loads(value))
        yield [gender, platform, count]


# Function to run the MapReduce job
def run_mrjob(input_file, output_dir):
    """Run PlatformByGender on Hadoop and publish its result as a CSV in HDFS.

    Args:
        input_file: HDFS URI of the input CSV.
        output_dir: HDFS directory for the job output. It is removed first if
            it exists; the final ``output_gender.csv`` is uploaded into it.

    Raises:
        subprocess.CalledProcessError: if ``hdfs dfs -getmerge`` or
            ``hdfs dfs -put`` fails.
    """
    # All local intermediates live in a private temporary directory that is
    # removed on exit, even when a step fails. Nothing in the current working
    # directory is created, overwritten or deleted.
    with tempfile.TemporaryDirectory() as work_dir:
        merged_output = os.path.join(work_dir, 'merged_output.tsv')
        local_csv = os.path.join(work_dir, 'final_output.csv')

        # Remove the existing output directory if it exists
        subprocess.run(['hdfs', 'dfs', '-rm', '-r', output_dir], check=False)

        # Run the MRJob with HDFS input and HDFS output
        mr_job = PlatformByGender(args=['-r', 'hadoop', input_file, '--output-dir', output_dir])

        with mr_job.make_runner() as runner:
            runner.run()

            # Merge the job's part files from HDFS into one local file
            subprocess.run(['hdfs', 'dfs', '-getmerge', f'{output_dir}/part-*', merged_output], check=True)

            # Convert the merged output into a flat CSV
            with open(merged_output, 'r') as infile, open(local_csv, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(['Gender', 'Platform', 'Count'])
                writer.writerows(_gender_platform_rows(infile))

            # Put the final output CSV to HDFS
            final_output_path = f'{output_dir}/output_gender.csv'
            subprocess.run(['hdfs', 'dfs', '-put', local_csv, final_output_path], check=True)

if __name__ == '__main__':
    input_file = 'hdfs:///test_data/input.csv'  #  input HDFS CSV file
    output_dir = 'hdfs:///output_data/'  #  output HDFS directory
    run_mrjob(input_file, output_dir)


In [None]:
from mrjob.job import MRJob
from mrjob.step import MRStep

class PlatformByGender(MRJob):
    """MapReduce job that counts users per (gender, platform) pair.

    Input: comma-separated lines with exactly ten fields, where field 2 is the
    gender and field 3 is the platform (zero-based positions).
    Output: one ``((gender, platform), count)`` record per pair.
    """

    # Number of fields a valid line must have, and the positions read from it.
    EXPECTED_FIELDS = 10
    GENDER_FIELD = 2
    PLATFORM_FIELD = 3

    def mapper(self, _, line):
        """Emit ``((gender, platform), 1)`` for every usable data line.

        The header line, lines without exactly ten fields, and lines whose
        gender or platform is empty are skipped.
        """
        # Skip the header
        if line.startswith("User_ID"):
            return

        # Strip each field so stray spaces or a trailing '\r' do not create
        # separate keys for the same value.
        fields = [field.strip() for field in line.split(',')]
        if len(fields) != self.EXPECTED_FIELDS:
            return

        gender = fields[self.GENDER_FIELD]
        platform = fields[self.PLATFORM_FIELD]

        # Rows made only of separators (e.g. ',,,,,,,,,') have ten fields but no
        # data; counting them would add an empty (gender, platform) pair.
        if gender and platform:
            yield (gender, platform), 1

    def reducer(self, key, values):
        """Emit ``((gender, platform), total)`` by summing the mapper's 1s."""
        gender, platform = key
        yield (gender, platform), sum(values)

if __name__ == '__main__':
    PlatformByGender.run()



## Task 4: Post count based on Platform
Mapper: Reads each line of the CSV, extracts the platform and the number of posts per day, and emits a tuple (platform, posts per day).

Reducer: Aggregates the total number of posts for each platform.

![Example Image](platform_post_count.jpg)


In [None]:
import csv
import subprocess
import tempfile
import os
from platform_by_gender import PlatformByGender
from mrjob.job import MRJob

def _gender_platform_rows(lines):
    """Yield ``[gender, platform, count]`` rows parsed from the job's output lines."""
    import json  # local import: only this helper needs it

    for line in lines:
        line = line.rstrip('\n')
        if not line.strip():
            continue  # tolerate blank lines in the merged output
        key, value = line.split('\t', 1)
        # mrjob writes key and value JSON-encoded by default, so the key is a
        # JSON list [gender, platform]; json.loads parses it without eval().
        gender, platform = json.loads(key)
        count = int(json.loads(value))
        yield [gender, platform, count]


# Function to run the MapReduce job
# NOTE(review): this cell runs PlatformByGender (user count per gender and
# platform); a job class that counts posts per platform is not visible in this
# notebook — confirm which job this task is meant to run.
def run_mrjob(input_file, output_dir):
    """Run PlatformByGender on Hadoop and publish its result as a CSV in HDFS.

    Args:
        input_file: HDFS URI of the input CSV.
        output_dir: HDFS directory for the job output. It is removed first if
            it exists; the final ``output_gender.csv`` is uploaded into it.

    Raises:
        subprocess.CalledProcessError: if ``hdfs dfs -getmerge`` or
            ``hdfs dfs -put`` fails.
    """
    # All local intermediates live in a private temporary directory that is
    # removed on exit, even when a step fails. Nothing in the current working
    # directory is created, overwritten or deleted.
    with tempfile.TemporaryDirectory() as work_dir:
        merged_output = os.path.join(work_dir, 'merged_output.tsv')
        local_csv = os.path.join(work_dir, 'final_output.csv')

        # Remove the existing output directory if it exists
        subprocess.run(['hdfs', 'dfs', '-rm', '-r', output_dir], check=False)

        # Run the MRJob with HDFS input and HDFS output
        mr_job = PlatformByGender(args=['-r', 'hadoop', input_file, '--output-dir', output_dir])

        with mr_job.make_runner() as runner:
            runner.run()

            # Merge the job's part files from HDFS into one local file
            subprocess.run(['hdfs', 'dfs', '-getmerge', f'{output_dir}/part-*', merged_output], check=True)

            # Convert the merged output into a flat CSV
            with open(merged_output, 'r') as infile, open(local_csv, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(['Gender', 'Platform', 'Count'])
                writer.writerows(_gender_platform_rows(infile))

            # Put the final output CSV to HDFS
            final_output_path = f'{output_dir}/output_gender.csv'
            subprocess.run(['hdfs', 'dfs', '-put', local_csv, final_output_path], check=True)

if __name__ == '__main__':
    input_file = 'hdfs:///test_data/input.csv'  #  input HDFS CSV file
    output_dir = 'hdfs:///output_data/'  #  output HDFS directory
    run_mrjob(input_file, output_dir)


## Driver code

In [None]:
import csv
import subprocess
import tempfile
import os
from platform_by_gender import PlatformByGender
from mrjob.job import MRJob

def _gender_platform_rows(lines):
    """Yield ``[gender, platform, count]`` rows parsed from the job's output lines."""
    import json  # local import: only this helper needs it

    for line in lines:
        line = line.rstrip('\n')
        if not line.strip():
            continue  # tolerate blank lines in the merged output
        key, value = line.split('\t', 1)
        # mrjob writes key and value JSON-encoded by default, so the key is a
        # JSON list [gender, platform]; json.loads parses it without eval().
        gender, platform = json.loads(key)
        count = int(json.loads(value))
        yield [gender, platform, count]


# Function to run the MapReduce job
# NOTE(review): this driver runs PlatformByGender (user count per gender and
# platform); confirm that this is the job intended for this section.
def run_mrjob(input_file, output_dir):
    """Run PlatformByGender on Hadoop and publish its result as a CSV in HDFS.

    Args:
        input_file: HDFS URI of the input CSV.
        output_dir: HDFS directory for the job output. It is removed first if
            it exists; the final ``output_gender.csv`` is uploaded into it.

    Raises:
        subprocess.CalledProcessError: if ``hdfs dfs -getmerge`` or
            ``hdfs dfs -put`` fails.
    """
    # All local intermediates live in a private temporary directory that is
    # removed on exit, even when a step fails. Nothing in the current working
    # directory is created, overwritten or deleted.
    with tempfile.TemporaryDirectory() as work_dir:
        merged_output = os.path.join(work_dir, 'merged_output.tsv')
        local_csv = os.path.join(work_dir, 'final_output.csv')

        # Remove the existing output directory if it exists
        subprocess.run(['hdfs', 'dfs', '-rm', '-r', output_dir], check=False)

        # Run the MRJob with HDFS input and HDFS output
        mr_job = PlatformByGender(args=['-r', 'hadoop', input_file, '--output-dir', output_dir])

        with mr_job.make_runner() as runner:
            runner.run()

            # Merge the job's part files from HDFS into one local file
            subprocess.run(['hdfs', 'dfs', '-getmerge', f'{output_dir}/part-*', merged_output], check=True)

            # Convert the merged output into a flat CSV
            with open(merged_output, 'r') as infile, open(local_csv, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(['Gender', 'Platform', 'Count'])
                writer.writerows(_gender_platform_rows(infile))

            # Put the final output CSV to HDFS
            final_output_path = f'{output_dir}/output_gender.csv'
            subprocess.run(['hdfs', 'dfs', '-put', local_csv, final_output_path], check=True)

if __name__ == '__main__':
    input_file = 'hdfs:///test_data/input.csv'  #  input HDFS CSV file
    output_dir = 'hdfs:///output_data/'  #  output HDFS directory
    run_mrjob(input_file, output_dir)
