<a href="https://colab.research.google.com/github/awaleedpk/Kaggle_Notebooks/blob/main/Harvard_University_Ratings_and_Reviews_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import plotly.express as px
import datetime
import plotly.graph_objects as go
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np


In [7]:
# Load the raw review export.
# NOTE: this is an absolute Colab-style path; adjust it when running elsewhere.
csv_path = '/content/harvard_reviews.csv'
reviews_df = pd.read_csv(csv_path)

In [8]:
# Show the first five rows to check that the columns loaded as expected
reviews_df.head(n=5)


Unnamed: 0,published_date,published_platform,rating,type,helpful_votes,title,text
0,2023-12-28T08:02:14-05:00,Mobile,5,review,0,Best classes and good environment,Good thanks for everything good work group 👍 h...
1,2023-12-12T00:38:26-05:00,Desktop,4,review,0,Harvard University,Harvard University was founded in 1636 and is ...
2,2023-12-10T08:21:35-05:00,Mobile,3,review,0,Walk around campus,We did a walk around most of the Harvard Campu...
3,2023-11-07T21:20:21-05:00,Desktop,4,review,0,Interesting Harvard University,We had a walk through the university grounds w...
4,2023-10-02T17:40:42-04:00,Desktop,5,review,0,Lovely university campus has many historic and...,Finally made it to Harvard!! This iconic univ...


In [9]:

# Parse the timestamp strings into timezone-aware datetimes normalised to UTC
reviews_df['published_date'] = pd.to_datetime(reviews_df['published_date'], utc=True)

# One marker per review: rating against publication time, coloured by platform
platform_palette = {'Mobile': 'blue', 'Desktop': 'green'}
axis_names = {'published_date': 'Published Date', 'rating': 'Rating'}

fig = px.scatter(
    reviews_df,
    x='published_date',
    y='rating',
    color='published_platform',
    title='Harvard University Ratings Over Time by Platform',
    labels=axis_names,
    color_discrete_map=platform_palette,
)

# Semi-transparent markers so overlapping reviews remain distinguishable
fig.update_traces(marker={'size': 8, 'opacity': 0.5}, selector={'mode': 'markers'})
fig.show()


In [10]:
import pandas as pd
import plotly.graph_objects as go

# Expects `reviews_df` as loaded from the CSV in an earlier cell.

# Parse timestamps as timezone-aware UTC (harmless to repeat if already converted)
reviews_df['published_date'] = pd.to_datetime(reviews_df['published_date'], utc=True)

# Month bucket used for grouping. Periods cannot carry a timezone, so drop it
# explicitly before converting: the values are already UTC, so the resulting
# months are unchanged, and pandas no longer emits the
# "Converting to PeriodArray/Index representation will drop timezone information" warning.
reviews_df['year_month'] = (
    reviews_df['published_date'].dt.tz_localize(None).dt.to_period('M')
)

# Review counts with one row per month and one column per platform;
# month/platform combinations with no reviews become 0
platform_counts = reviews_df.groupby(['year_month', 'published_platform']).size().unstack(fill_value=0)

# Move 'year_month' out of the index so it is an ordinary column
platform_counts_reset = platform_counts.reset_index()

# Plotly needs plain strings for the period labels; compute them once for all traces
month_labels = platform_counts_reset['year_month'].astype(str)

# Stacked bar chart: one trace per platform
fig = go.Figure()
for platform in platform_counts_reset.columns[1:]:  # first column is 'year_month'
    fig.add_trace(go.Bar(
        x=month_labels,
        y=platform_counts_reset[platform],
        name=platform,
    ))

fig.update_layout(
    barmode='stack',
    title='Count of Reviews by Platform Over Time',
    xaxis_title='Year and Month',
    yaxis_title='Count of Reviews',
    legend_title='Platform',
    xaxis={'type': 'category'}  # treat each month label as a discrete category
)

fig.show()



Converting to PeriodArray/Index representation will drop timezone information.



In [11]:
from textblob import TextBlob

# Function to categorize sentiment
def categorize_sentiment(text):
    """Label a review text as 'Positive', 'Neutral' or 'Negative'.

    The label is the sign of the TextBlob polarity score of ``text``.

    Parameters
    ----------
    text : str or missing value
        The review text. Non-string values (for example the NaN pandas uses
        for an empty CSV cell) and blank strings are labelled 'Neutral'
        without calling TextBlob, because there is nothing to score and
        TextBlob only accepts strings.

    Returns
    -------
    str
        'Positive' if polarity > 0, 'Neutral' if polarity == 0,
        otherwise 'Negative'.
    """
    # Guard: missing or empty text carries no sentiment
    if not isinstance(text, str) or not text.strip():
        return 'Neutral'

    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity == 0:
        return 'Neutral'
    return 'Negative'

# Label every review text with its sentiment category
reviews_df['sentiment'] = reviews_df['text'].map(categorize_sentiment)

# Show the first rows of text alongside the assigned label
reviews_df.loc[:, ['text', 'sentiment']].head(5)


Unnamed: 0,text,sentiment
0,Good thanks for everything good work group 👍 h...,Positive
1,Harvard University was founded in 1636 and is ...,Positive
2,We did a walk around most of the Harvard Campu...,Positive
3,We had a walk through the university grounds w...,Positive
4,Finally made it to Harvard!! This iconic univ...,Positive


In [12]:


# Requires the 'sentiment' and 'year_month' columns to already exist on reviews_df.

# Monthly review counts with one column per sentiment label (missing combinations -> 0)
sentiment_counts = (
    reviews_df
    .groupby(['year_month', 'sentiment'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)

# Fixed colour for each sentiment label
colors = {'Positive': 'green', 'Neutral': 'grey', 'Negative': 'red'}

# Period values rendered as strings for the x-axis
month_labels = sentiment_counts['year_month'].astype(str)

# Stacked bar chart: one trace per sentiment column (everything after 'year_month')
fig = go.Figure(data=[
    go.Bar(
        x=month_labels,
        y=sentiment_counts[label],
        name=label,
        marker_color=colors[label],
    )
    for label in sentiment_counts.columns[1:]
])

fig.update_layout(
    barmode='stack',
    title='Sentiment of Reviews Over Time',
    xaxis_title='Year and Month',
    yaxis_title='Count of Reviews',
    legend_title='Sentiment',
    xaxis={'type': 'category'}  # each month label is a discrete category
)

fig.show()


In [13]:


# Requires reviews_df with 'title' and 'helpful_votes' columns.

# Number of reviews to display. The chart title is built from this value so the
# heading always matches the number of rows selected (previously the title said
# "Top 10" while 20 rows were plotted).
top_n = 20

# Reviews with the highest helpful_votes
top_helpful_reviews = reviews_df.nlargest(top_n, 'helpful_votes')

# Bar chart of helpful votes per review title
fig = px.bar(top_helpful_reviews, x='title', y='helpful_votes',
             text='helpful_votes', color='helpful_votes',
             labels={'title': 'Review Title', 'helpful_votes': 'Number of Helpful Votes'},
             title=f'Top {top_n} Most Helpful Reviews')

# Vote count printed above each bar; tilt the long titles for readability
fig.update_traces(texttemplate='%{text}', textposition='outside')
fig.update_layout(xaxis_tickangle=-45, uniformtext_minsize=8, uniformtext_mode='hide')

fig.show()


In [14]:

# Missing review text is treated as an empty document so vectorisation cannot fail on NaN
review_texts = reviews_df['text'].fillna('')

# Document-term matrix restricted to the 10 most frequent non-stop-word tokens
vectorizer = CountVectorizer(stop_words='english', max_features=10)
X = vectorizer.fit_transform(review_texts)

# Column j of X corresponds to keywords[j]
keywords = vectorizer.get_feature_names_out()

# Total occurrences of each keyword across all reviews
keyword_counts = np.asarray(X.sum(axis=0)).ravel()

# Average rating of the reviews that contain each keyword.
# Membership is taken from the document-term matrix itself, so it uses the same
# tokenisation as the frequency counts. A substring search on the raw text would
# also match longer words that merely contain the keyword and would average over
# a different set of reviews.
keyword_present = X.toarray() > 0  # dense is fine: only max_features columns
keyword_ratings = [
    reviews_df.loc[keyword_present[:, col], 'rating'].mean()
    for col in range(len(keywords))
]

# Assemble the data for plotting
keywords_df = pd.DataFrame({
    'Keyword': keywords,
    'Frequency': keyword_counts,
    'Average Rating': keyword_ratings
})

# Most frequent keywords first
keywords_df = keywords_df.sort_values(by='Frequency', ascending=False)

# Bar height = frequency, bar colour = average rating
fig = px.bar(keywords_df, x='Keyword', y='Frequency',
             color='Average Rating',
             labels={'Frequency': 'Frequency of Keyword', 'Average Rating': 'Average Rating'},
             title='Top Keywords by Frequency and Average Rating',
             color_continuous_scale=px.colors.diverging.Tealrose)

fig.show()


In [15]:
import plotly.express as px

# Distribution of ratings for each value of the 'type' column.
# NOTE(review): the title assumes 'type' separates academic from tourism-focused
# reviews, but the sample rows displayed earlier all have type == 'review' - confirm.
rating_box_title = 'Average Rating Comparison Between Academic and Tourism-Focused Reviews'
fig_rating = px.box(
    reviews_df,
    x='type',
    y='rating',
    color='type',
    title=rating_box_title,
)

fig_rating.show()


In [16]:
# Sentiment labels to plot, in display order
sentiment_order = ['Positive', 'Neutral', 'Negative']

# Review counts with one row per 'type' and one column per sentiment label.
# fill_value=0 turns absent type/sentiment combinations into 0 instead of NaN,
# and the reindex guarantees all three columns exist even when a label never
# occurs in the data, so the plot below cannot fail on a missing column.
sentiment_comparison_df = (
    reviews_df
    .groupby(['type', 'sentiment'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=sentiment_order, fill_value=0)
    .reset_index()
)

# Grouped bars: one bar per sentiment label within each review type
fig_sentiment = px.bar(sentiment_comparison_df, x='type', y=sentiment_order,
                       barmode='group',
                       title='Sentiment Comparison Between Academic and Tourism-Focused Reviews')

fig_sentiment.show()
