In [None]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import sketch 
from dataprep.eda import plot
from dataprep.clean import clean_text

In [None]:
# Load the Reddit comment dump. `created_time` is parsed as a datetime column
# so the `.dt` accessor and time-based `resample` used in later cells work.
df = pd.read_csv(
    '/kaggle/input/reddit-on-israel-palestine-daily-updated/pse_isr_reddit_comments.csv',
    parse_dates=['created_time'],
)

In [None]:
# Run dataprep.eda's `plot` on the entire frame for a first look at the raw data.
plot(df)

In [None]:
# Same dataprep.eda `plot` helper, restricted to the `self_text` column
# (presumably the comment body; it is the column cleaned and scored in later cells).
plot(df, 'self_text')

In [None]:
# Five most frequent time-of-day values (date part dropped) as horizontal bars.
# NOTE(review): `.dt.time` keeps the full clock time, so this counts exact
# timestamps rather than hours -- confirm that is the intended granularity.
(
    df['created_time']
    .dt.time
    .value_counts()
    .nlargest(5)
    .plot.barh(title='Most frequency time created', figsize=(8, 8))
)

In [None]:
# Hourly comment volume (left panel) and its running total (right panel).
# The hourly counts are computed once and shared by both panels instead of
# resampling the whole frame twice. `count()` only counts non-null `self_text`.
# NOTE(review): pandas >= 2.2 deprecates the 'H' alias in favour of 'h';
# the original alias is kept here for compatibility with older pandas.
hourly_counts = df.resample('1H', on='created_time')['self_text'].count()

# Explicit axes instead of relying on whichever subplot is "current".
fig, (ax_hourly, ax_cumulative) = plt.subplots(1, 2, figsize=(14, 6))
hourly_counts.plot(ax=ax_hourly, title='Self Text created after 1 hours')
hourly_counts.cumsum().plot(ax=ax_cumulative, title='Self Text after 1 hours cumulative')
fig.tight_layout()
plt.show()

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
# Clean `self_text` with dataprep's `clean_text`, using spaCy's English stop words.
# STOP_WORDS is already a set, so it is passed as-is: the `stopwords` parameter
# is documented as a set, and copying it into a list would turn every
# stop-word membership test into a linear scan.
# NOTE: `df` is overwritten, so the raw text is no longer available after this cell.
df = clean_text(df, 'self_text', stopwords=STOP_WORDS)

In [None]:
import eng_spacysentiment

# Pre-trained spaCy pipeline whose `doc.cats` holds a score for each of the
# 'positive', 'negative' and 'neutral' labels.
nlp = eng_spacysentiment.load()

# spaCy only accepts strings: a missing comment (NaN) would make `nlp(...)`
# raise, so missing values are scored as empty text, exactly like comments
# that became empty after cleaning.
texts = df['self_text'].fillna('').astype(str)

# One list of scores per label, in the same row order as `df`.
sentiment = {'positive': [],
            'negative': [],
            'neutral': []}

# `nlp.pipe` streams the texts through the pipeline in batches, which is much
# faster than calling `nlp(text)` once per row. Scores are expected to match
# the per-row calls up to floating-point noise.
for doc in nlp.pipe(texts):
    for label, scores in sentiment.items():
        scores.append(doc.cats[label])

In [None]:
# Bin edges for the `score` column. pd.cut uses right-closed intervals, so the
# buckets are (-inf, 0], (0, 1], (1, 5], (5, 10] and (10, inf).
# The outer edges are open-ended: with fixed limits (-1000 / 17000) any score
# outside them -- likely in a dataset that keeps growing -- would silently
# become NaN instead of landing in the first or last bucket.
bins = [-np.inf, 0, 1, 5, 10, np.inf]

# One label per bucket, in the same order as the intervals above.
# NOTE(review): a score of exactly 1 falls in 'Negative' and 2-5 in 'Neutral';
# confirm this is the intended labelling.
labels = ['Strongly Negative', 'Negative', 'Neutral', 'Positive', 'Strongly Positive']

# Drop any columns left over from a previous run of this cell, so re-running
# it does not fail with "cannot insert ..., already exists".
df = df.drop(columns=['score_label', 'positive', 'negative', 'neutral'], errors='ignore')

# Add the binned score and the three sentiment scores at fixed positions.
df.insert(2, 'score_label', pd.cut(df['score'], bins=bins, labels=labels))
df.insert(4, 'positive', sentiment['positive'])
df.insert(5, 'negative', sentiment['negative'])
df.insert(6, 'neutral', sentiment['neutral'])

In [None]:
# dataprep.eda `plot` over only the three sentiment-score columns inserted in the previous cell.
plot(df[['positive', 'negative', 'neutral']])

In [None]:
# Three stacked bar charts: for each sentiment label, the number of comments
# per subreddit whose score for that label is at least 0.5.
fig = plt.figure(figsize=(14, 9))
for position, label in enumerate(('positive', 'negative', 'neutral'), start=1):
    panel = fig.add_subplot(3, 1, position)
    dominant = df.query(f'{label} >= 0.5')
    dominant['subreddit'].value_counts().plot(
        kind='barh',
        ax=panel,
        title=f'Subreddit with highest {label} comment',
    )
plt.tight_layout()
plt.show()

## Positive Comments

In [None]:
# dataprep.eda `plot` of `self_text` for rows whose `positive` score is at least 0.5.
plot(df.query('positive >= 0.5'), 'self_text')

## Negative Comments

In [None]:
# dataprep.eda `plot` of `self_text` for rows whose `negative` score is at least 0.5.
plot(df.query('negative >= 0.5'), 'self_text')

## Neutral Comments

In [None]:
# dataprep.eda `plot` of `self_text` for rows whose `neutral` score is at least 0.5.
plot(df.query('neutral >= 0.5'), 'self_text')

In [None]:
# Pairwise relationships between the three sentiment scores, coloured by the
# binned Reddit score.
sns.pairplot(
    data=df,
    vars=['positive', 'negative', 'neutral'],
    hue='score_label',
)

In [None]:
# One categorical plot per sentiment score: subreddit on the y-axis, the score
# on the x-axis, coloured by the binned Reddit score. A loop replaces three
# copy-pasted calls and also keeps the FacetGrid repr out of the cell output.
for sentiment_column in ('positive', 'negative', 'neutral'):
    sns.catplot(data=df, y='subreddit', x=sentiment_column, hue='score_label')

In [None]:
# The ten rows with the highest `score`, shown as the cell output.
df.nlargest(n=10, columns='score')