In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [2]:
# Load the comment dataset from "reddit.csv" (path is relative to the
# notebook's working directory) into the module-level frame `df`.
df = pd.read_csv("reddit.csv")

# Data preprocessing

In [3]:
# Structural overview of the raw frame, followed by removal of rows that
# contain missing values.
print("Feature and Datatype")
print("=="*40)
print("=="*40)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
print("=="*40)
print("statistical summary")
print("=="*40)
print("=="*40)
print(df.describe())
print("=="*40)
print("Number of empty values")
print("=="*40)
print("=="*40)
print(df.isnull().sum())

print("=="*40)
# dropna(inplace=True) also returns None; drop the rows first and then report
# how many were removed instead of printing "None".
rows_before = len(df)
df.dropna(inplace=True)
print(f"Dropped {rows_before - len(df)} rows containing missing values")

df.head()

Feature and Datatype
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37249 entries, 0 to 37248
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_comment  37149 non-null  object
 1   category       37249 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 582.1+ KB
None
statistical summary
           category
count  37249.000000
mean       0.202771
std        0.778515
min       -1.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Number of empty values
clean_comment    100
category           0
dtype: int64
None


Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [4]:
# Rename the source text column to 'comment' (in place) so that the name
# 'clean_comment' can be reused for the cleaned text produced by
# clean_comment_block.
df.rename(columns={'clean_comment': 'comment'}, inplace=True)

In [5]:
# basic text cleaning
def clean_comment_block(df):
    """Return a copy of ``df`` with a normalised ``clean_comment`` column.

    The text is read from ``df['comment']``. Missing values are treated as
    empty strings, everything is lower-cased, and URLs, HTML tags, HTML
    entities, @mentions and the ``#`` symbol are removed. Any character that
    is not a lowercase ASCII letter, whitespace or an apostrophe is replaced
    by a space, after which whitespace runs are collapsed and the text is
    stripped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``comment`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with ``clean_comment`` added (or replaced), without the rows
        whose cleaned text is empty. The original index labels are kept; the
        index is not reset. The input frame is left unmodified.

    Raises
    ------
    KeyError
        If ``df`` has no ``comment`` column.
    """
    cleaned = (
        df['comment']
          .fillna('')
          .astype(str)
          .str.lower()                                         # lower case
          # Literal backslash-n sequences have to be handled before the
          # character whitelist below: that step replaces the backslash with
          # a space, which would leave a stray "n" glued to the next word.
          .str.replace(r'\\n', ' ', regex=True)                # literal "\n"
          .str.replace(r'http\S+|www\.\S+', ' ', regex=True)   # URLs
          .str.replace(r'<.*?>', ' ', regex=True)              # HTML tags
          .str.replace(r'&\w+;', ' ', regex=True)              # HTML entities
          .str.replace(r'@\w+', ' ', regex=True)               # @mentions
          .str.replace(r'#', '', regex=True)                   # drop hashtag symbol
          .str.replace(r"[^a-z\s']", ' ', regex=True)          # keep letters/space/'
          # \s matches spaces, tabs and real newlines, so a single pass
          # collapses every kind of whitespace run.
          .str.replace(r'\s+', ' ', regex=True)
          .str.strip()
    )
    # assign() builds a new frame, so the caller's frame is never mutated.
    result = df.assign(clean_comment=cleaned)
    # The text is already stripped, so an empty string means nothing survived.
    return result[result['clean_comment'] != ""]

# usage: run the cleaning step and rebind df to the returned frame, which
# excludes rows whose cleaned text is empty.
df = clean_comment_block(df)

In [6]:

def remove_duplicates(df):
    """Report and drop rows whose ``clean_comment`` text is repeated.

    Prints the number of duplicate rows and a preview of up to 20 of them
    (sorted by text), then keeps the first occurrence of each text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``clean_comment`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with one row per distinct ``clean_comment`` and a fresh
        0..n-1 index. The input frame is not modified.
    """
    # how many duplicates (every occurrence after the first counts as one)
    duplicate_count = df['clean_comment'].duplicated().sum()
    # Double quotes outside, single quotes inside: reusing the same quote
    # character inside an f-string is a SyntaxError before Python 3.12.
    print(f"Number of duplicates:{duplicate_count}")
    # preview duplicate rows, including the first occurrence of each text
    print("=="*40)
    print("Preview first 20 duplicate rows")
    print("=="*40)
    print("=="*40)
    all_copies = df['clean_comment'].duplicated(keep=False)
    print(df[all_copies].sort_values('clean_comment').head(20))
    # drop duplicates (keep first)
    # NOTE: only the text is compared, so rows with identical text but a
    # different value in another column collapse to the first row seen.
    return df.drop_duplicates(subset='clean_comment', keep='first').reset_index(drop=True)
# De-duplicate on the cleaned text and rebind df to the re-indexed result.
df = remove_duplicates(df) 

Number of duplicates:450
Preview first 20 duplicate rows
            comment  category clean_comment
19086       aadhar          0        aadhar
13868       aadhar          0        aadhar
4059           aap          0           aap
3035           aap          0           aap
2643    about time          0    about time
2648    about time          0    about time
27882    about time         0    about time
14297   absolutely          1    absolutely
29567    absolutely         1    absolutely
20081    acche din          0     acche din
20321    acche din          0     acche din
4525      acche din         0     acche din
29838    acche din          0     acche din
12828    acche din          0     acche din
5852     acche din™         0     acche din
7000      ache din          0      ache din
18789     ache din          0      ache din
29432    achhe din          0     achhe din
32040    achhe din          0     achhe din
15045     achhe din         0     achhe din


In [7]:
# Number of rows per label value in 'category' (most frequent first).
df['category'].value_counts()

category
 1    15721
 0    12496
-1     8233
Name: count, dtype: int64

# Exploratory Data Analysis

In [8]:
import plotly.express as px

def plot_category_distribution(df, col='category', kind='bar', title=None, top_n=None):
    """Plot how often each value of ``df[col]`` occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to summarise.
    col : str
        Name of the column whose values are counted.
    kind : str
        ``'pie'`` draws a donut chart; any other value draws a bar chart.
    title : str or None
        Figure title. When None, a title with the column name and the number
        of rows in ``df[col]`` is generated.
    top_n : int or None
        When given, only the ``top_n`` most frequent values are plotted and
        returned. The ``percent`` column is still computed over all counted
        rows. None (the default) keeps every value.

    Returns
    -------
    tuple
        ``(fig, data)`` where ``fig`` is the Plotly figure (already shown via
        ``fig.show()``) and ``data`` is a frame with the columns
        ``[col, 'count', 'percent']`` ordered by descending count.
    """
    s = df[col]
    counts = s.value_counts()   # sorted by descending count; NaN is excluded

    data = counts.reset_index()
    data.columns = [col, 'count']
    data['percent'] = (data['count'] / data['count'].sum() * 100).round(2)
    if top_n is not None:
        data = data.head(top_n)

    if title is None:
        title = f"Distribution of {col} (n={len(s)})"

    # Plot string labels: with a numeric column Plotly Express uses a
    # continuous colour scale and a numeric axis, so category_orders and the
    # per-category colours would have no effect. The returned `data` keeps
    # the original dtype.
    plot_data = data.copy()
    plot_data[col] = plot_data[col].astype(str)

    if kind == 'pie':
        fig = px.pie(
            plot_data, names=col, values='count',
            title=title, hole=0.35
        )
        fig.update_traces(textinfo='percent+label')
    else:
        # `data` is already ordered by descending count.
        order = plot_data[col].tolist()
        fig = px.bar(
            plot_data, x=col, y='count', text='count',
            title=title, color=col, category_orders={col: order}
        )
        fig.update_traces(textposition='outside')
        fig.update_layout(
            xaxis_title=col, yaxis_title='Count',
            showlegend=False, xaxis_tickangle=-45,
            # Numeric-looking strings such as "-1" would otherwise be
            # auto-detected as numbers and placed on a linear axis.
            xaxis_type='category'
        )

    fig.show()
    return fig, data

# usage: the trailing ';' stops the notebook from echoing the returned
# (fig, data) tuple underneath the chart that fig.show() already rendered.
plot_category_distribution(df, col='category', kind='bar');
# plot_category_distribution(df, col='category', kind='pie');

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [68]:
# Sanity check: list the columns present in df at this point.
df.columns

Index(['comment', 'category', 'clean_comment'], dtype='object')

In [None]:
# Number of whitespace-separated tokens in each clean_comment; a missing
# value is treated as an empty string and therefore counts as 0 words.
df['word_count'] = (
    df['clean_comment']
      .fillna('')
      .str.split()
      .str.len()
)

# (optional, a bit more robust to punctuation)
# df['word_count'] = df['clean_comment'].fillna('').str.findall(r'\b\w+\b').str.len()

# Preview the text next to its computed length.
df[['clean_comment', 'word_count']].head()

In [None]:
# optional: remove English stopwords (uncomment if sklearn available)
# from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# sw = ENGLISH_STOP_WORDS
# df['clean_comment'] = df['clean_comment'].apply(lambda t: ' '.join(w for w in t.split() if w not in sw))