In [1]:
import pandas as pd

# Load the Excel file
file_path = 'Sample Qualitative Data-COVID Study.xlsx'
df = pd.read_excel(file_path)

# Display the first few rows of the dataframe to understand its structure
df.head()


Unnamed: 0,StartDate,Progress,Impact
0,Start Date,Progress,Describe how the COVID-19 pandemic has impacte...
1,2020-05-04 23:29:03,100,I was not able to work and online classes were...
2,2020-05-04 23:27:48,100,COVID-19 has limited my social interactions as...
3,2020-05-04 23:31:01,100,I am not working right now and I feel like I a...
4,2020-05-04 23:27:45,100,At first the shelter in place did not bother m...


In [2]:
from textblob import TextBlob
impact_responses = df['Impact'][1:] 
# Adjusting the approach to directly handle the data within the DataFrame
# Reinitializing the sentiment categories list to ensure it starts empty
sentiment_categories = []

# Analyze each response for sentiment and categorize it, including the header row this time
for index, row in df.iterrows():
    if index == 0:  # Skip analysis for the header row and assign a placeholder
        sentiment_categories.append("Sentiment")
        continue

    analysis = TextBlob(str(row['Impact']))  # Ensure the response is in string format for analysis
    polarity = analysis.sentiment.polarity  # Get the polarity score: -1 (negative) to 1 (positive)

    # Categorize the sentiment based on polarity
    if polarity > 0:
        sentiment = 'Positive'
    elif polarity <0:
        sentiment = 'Negative'
    else:
        sentiment = 'Neutral'  # Neutral sentiment for polarity == 0

    sentiment_categories.append(sentiment)

# The length of sentiment_categories list should now match the DataFrame's length
assert len(sentiment_categories) == len(df)

# Inserting the sentiment categories back into the dataframe
df['Sentiment'] = sentiment_categories

# Display the updated dataframe with sentiment categories, excluding the first row which contains the header
df[['Impact', 'Sentiment']].iloc[1:].head()


Unnamed: 0,Impact,Sentiment
1,I was not able to work and online classes were...,Negative
2,COVID-19 has limited my social interactions as...,Positive
3,I am not working right now and I feel like I a...,Positive
4,At first the shelter in place did not bother m...,Positive
5,"School stress has increased, as I am strugglin...",Negative


In [3]:
# Defining themes and associated keywords
themes_keywords = {
    "Work": ["work", "job", "unemployed", "work from home"],
    "Health": ["sick", "health", "hospital", "doctor"],
    "Social Life": ["friends", "family", "isolated", "social"],
    "Education": ["online classes", "school", "study", "education"],
    "Mental Health": ["stress", "anxiety", "depressed", "mental health"],
    "Self Change": ["change", "learn", "improve", "new"]
}

# Initialize a dictionary to hold the theme count for each sentiment category
theme_counts = {theme: {"Positive": 0,"Negative": 0, "Neutral": 0} for theme in themes_keywords}

# Adjusting the identify_themes function to handle non-string responses
def identify_themes_adjusted(response):
    identified_themes = set()
    # Ensure the response is treated as a string
    response_str = str(response)
    for theme, keywords in themes_keywords.items():
        for keyword in keywords:
            if keyword.lower() in response_str.lower():
                identified_themes.add(theme)
    if len(identified_themes) == 0:
        identified_themes.add('Self Change')
    return identified_themes

# Re-scan each response for themes and tally the counts within sentiment categories
# Reinitialize the theme counts to ensure it starts from zero
theme_counts = {theme: {"Positive": 0,"Negative": 0, "Neutral": 0} for theme in themes_keywords}

for index, row in df.iloc[1:].iterrows():  # Skip the first row which is a header
    response = row['Impact']
    sentiment = row['Sentiment']
    response_themes = identify_themes_adjusted(response)
   
    # add a theme column to the dataframe
    df.at[index, 'Themes'] = ', '.join(response_themes)

    for theme in response_themes:
        if sentiment in theme_counts[theme]:
            theme_counts[theme][sentiment] += 1
# calculate the frequency count for each theme
theme_counts_freq = {theme: sum(counts.values()) for theme, counts in theme_counts.items()} 


In [4]:
theme_counts

{'Work': {'Positive': 226, 'Negative': 115, 'Neutral': 41},
 'Health': {'Positive': 130, 'Negative': 102, 'Neutral': 5},
 'Social Life': {'Positive': 424, 'Negative': 196, 'Neutral': 39},
 'Education': {'Positive': 306, 'Negative': 148, 'Neutral': 39},
 'Mental Health': {'Positive': 194, 'Negative': 112, 'Neutral': 15},
 'Self Change': {'Positive': 365, 'Negative': 160, 'Neutral': 107}}

In [5]:
df.to_excel("content_analysis.xlsx")

In [6]:
pd.DataFrame(theme_counts_freq,index=[0]).to_excel("theme_counts.xlsx")

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1504 entries, 0 to 1503
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   StartDate  1504 non-null   object
 1   Progress   1504 non-null   object
 2   Impact     1475 non-null   object
 3   Sentiment  1504 non-null   object
 4   Themes     1503 non-null   object
dtypes: object(5)
memory usage: 58.9+ KB


: 