In [2]:
import pandas as pd
# Carregar o arquivo CSV
df = pd.read_csv('D:/OneDrive - InMotion - Consulting/DocPlanner-BusinessCase/Business_Case_Reference/Case_Data/Business Case PC PM - database - raw_Data.csv', encoding='utf-8')
# Exibir as primeiras linhas do dataframe para inspecionar os dados
df.head()

Unnamed: 0,country_code,type,created_at,content_en,@
0,br,opinion,2024-04-06,"A doctor who is not very inclusive, it seems t...",
1,br,opinion,2024-04-12,Although the online schedule showed availabili...,
2,br,opinion,2024-04-22,Didn't answer my wife. After traveling more th...,
3,br,opinion,2024-04-27,\nHe works at the Santo Amaro rehabilitation c...,
4,br,opinion,2024-04-29,This psychologist canceled the 1st appointment...,


In [3]:
# 1. Identificar Colunas com Valores Ausentes
# Isso exibirá o número de valores ausentes em cada coluna
missing_values = df.isnull().sum()
print("Valores ausentes por coluna:")
print(missing_values)
# 2. Analisar a Quantidade de Valores Ausentes
# ver como uma porcentagem do total
missing_percentage = (df.isnull().sum() / len(df)) * 100
print("\nPercentual de valores ausentes por coluna:")
print(missing_percentage)
# Exibir as primeiras linhas para confirmar as mudanças
print("\nPrimeiras linhas dos dados após tratamento:")
print(df.head())

Valores ausentes por coluna:
country_code        0
type                0
created_at          0
content_en          2
@               44853
dtype: int64

Percentual de valores ausentes por coluna:
country_code      0.000000
type              0.000000
created_at        0.000000
content_en        0.004459
@               100.000000
dtype: float64

Primeiras linhas dos dados após tratamento:
  country_code     type  created_at  \
0           br  opinion  2024-04-06   
1           br  opinion  2024-04-12   
2           br  opinion  2024-04-22   
3           br  opinion  2024-04-27   
4           br  opinion  2024-04-29   

                                          content_en   @  
0  A doctor who is not very inclusive, it seems t... NaN  
1  Although the online schedule showed availabili... NaN  
2  Didn't answer my wife. After traveling more th... NaN  
3  \nHe works at the Santo Amaro rehabilitation c... NaN  
4  This psychologist canceled the 1st appointment... NaN  


In [4]:
# Remover a coluna @ do DataFrame
df = df.drop(columns=['@'])
# Verificar se a coluna foi removida com sucesso
print("Colunas após a remoção:")
print(df.columns)

Colunas após a remoção:
Index(['country_code', 'type', 'created_at', 'content_en'], dtype='object')


In [5]:
# Filtrar as linhas onde o valor na coluna 'content_en' está ausente (NaN)
missing_content_en = df[df['content_en'].isnull()]
# Exibir essas linhas para análise
print("Linhas onde 'content_en' está ausente:")
print(missing_content_en)

Linhas onde 'content_en' está ausente:
      country_code     type  created_at content_en
14108           br  opinion  2024-05-29        NaN
23885           de  opinion  2024-04-09        NaN


In [6]:
# Filtrar as linhas onde o valor na coluna 'content_en' está ausente (NaN)
# {{ edit_1 }}
df = df[df['content_en'].notnull()]  # Remove linhas com 'content_en' ausente
# {{ edit_2 }}
# 2. Analisar a Quantidade de Valores Ausentes
# ver como uma porcentagem do total
missing_percentage = (df.isnull().sum() / len(df)) * 100
print("\nPercentual de valores ausentes por coluna:")
print(missing_percentage)


Percentual de valores ausentes por coluna:
country_code    0.0
type            0.0
created_at      0.0
content_en      0.0
dtype: float64


In [7]:
# Exibir o número total de linhas no DataFrame com separador de milhares
total_linhas = len(df)
print(f"Número total de linhas no DataFrame: {total_linhas:,}")

Número total de linhas no DataFrame: 44,851


---

## 1. Basic Text Preprocessing

---

versao 1.0

In [8]:
import nltk  # Library for natural language processing (NLP), provides tools for tokenization, lemmatization, and more
from nltk.corpus import stopwords  # Module to access lists of stopwords (common words usually removed during text processing)
from nltk.tokenize import word_tokenize  # Function to split text into words (tokenization)
from nltk.stem import WordNetLemmatizer  # Function to reduce words to their root form (lemmatization)
import pandas as pd  # Library for data manipulation and analysis, especially useful for working with tabular data
import os  # Library for interacting with the operating system, e.g., for file and directory manipulation
import re  # Library for regular expression operations, useful for searching and manipulating patterns in strings
import matplotlib.pyplot as plt  # Library for creating graphs and visualizations
from wordcloud import WordCloud  # Function to generate word clouds, a visualization that highlights frequent words in a text
from textblob import TextBlob  # Library for text processing, provides functionalities like sentiment analysis and grammatical correction

# Download necessary NLTK data (tokenizers, stopwords, and WordNet for lemmatization)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Specify the directory for NLTK data storage
nltk_data_dir = os.path.join(os.path.expanduser('~'), 'nltk_data')
if not os.path.exists(nltk_data_dir):
    os.makedirs(nltk_data_dir)  # Create the directory if it doesn't exist
nltk.data.path.append(nltk_data_dir)  # Add the directory to NLTK's data path

# Define a set of words to keep (includes negations and important medical terms)
words_to_keep = set(['not', 'no', 'never', 'doctor', 'nurse', 'patient', 'hospital', 'clinic', 'treatment', 'medicine'])

# Get the list of English stopwords
stop_words = set(stopwords.words('english')) - words_to_keep  # Remove the words we want to keep from the stopwords list

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Preprocess the input text by cleaning and normalizing it.
    """
    if isinstance(text, str):
        # Convert to lowercase
        text = text.lower()
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        # Keep alphanumeric characters, spaces, and important punctuation
        text = re.sub(r'[^a-zA-Z0-9\s!?.]', '', text)
        # Tokenize the text
        tokens = word_tokenize(text)
        # Lemmatize and remove stopwords, but keep important words
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words or word in words_to_keep]
        # Join the cleaned tokens back into a single string
        return ' '.join(tokens)
    else:
        return ''

def get_sentiment(text):
    """
    Calculate the sentiment score of the input text using TextBlob.
    """
    return TextBlob(text).sentiment.polarity

# Calculate sentiment before preprocessing
df['original_sentiment'] = df['content_en'].apply(get_sentiment)

# Apply the preprocessing function to the 'content_en' column of the DataFrame
df['cleaned_content'] = df['content_en'].apply(preprocess_text)

# Calculate sentiment after preprocessing
df['cleaned_sentiment'] = df['cleaned_content'].apply(get_sentiment)

# Calculate the absolute difference in sentiment scores
df['sentiment_difference'] = abs(df['original_sentiment'] - df['cleaned_sentiment'])

# Display the original, cleaned content, and sentiment scores
print(df[['content_en', 'cleaned_content', 'original_sentiment', 'cleaned_sentiment', 'sentiment_difference']].head(10))

# Analyze text length before and after preprocessing
df['original_length'] = df['content_en'].str.len()
df['cleaned_length'] = df['cleaned_content'].str.len()

print("\nAverage text length:")
print(f"Original: {df['original_length'].mean():.2f}")
print(f"Cleaned: {df['cleaned_length'].mean():.2f}")

# Specify the output directory for saving files
output_dir = "BusinessCase_Analysis/1_Data_Categorization/1.1_Preprocessing/"

# Visualize text length distribution
plt.figure(figsize=(10, 6))
plt.hist(df['original_length'], bins=50, alpha=0.5, label='Original')
plt.hist(df['cleaned_length'], bins=50, alpha=0.5, label='Cleaned')
plt.xlabel('Text Length')
plt.ylabel('Frequency')
plt.title('Distribution of Text Length Before and After Preprocessing')
plt.legend()
plt.savefig(os.path.join(output_dir, 'text_length_distribution.png'))  # Updated path
plt.close()
print("Text length distribution chart saved to:", os.path.join(output_dir, 'text_length_distribution.png'))

# Create word clouds for original and cleaned text
def create_wordcloud(text, title):
    """
    Create and save a word cloud visualization for the given text.
    """
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.savefig(os.path.join(output_dir, f'{title.lower().replace(" ", "_")}.png'))  # Updated path
    plt.close()
    print(f"Word cloud '{title}' saved to:", os.path.join(output_dir, f'{title.lower().replace(" ", "_")}.png'))

create_wordcloud(' '.join(df['content_en']), 'Original Text Word Cloud')
create_wordcloud(' '.join(df['cleaned_content']), 'Cleaned Text Word Cloud')

# Validate preprocessing and sentiment analysis
sample_size = min(100, len(df))
sample = df.sample(sample_size)

def validate_preprocessing(row):
    """
    Validate if preprocessing was effective and sentiment remained consistent.
    """
    original = set(row['content_en'].lower().split())
    cleaned = set(row['cleaned_content'].split())
    removed = original - cleaned
    sentiment_changed = row['sentiment_difference'] > 0.2
    return len(removed) > 0 and len(cleaned) > 0 and not sentiment_changed

validation_results = sample.apply(validate_preprocessing, axis=1)
validation_percentage = (validation_results.sum() / sample_size) * 100

print(f"\nPreprocessing and Sentiment Validation:")
print(f"Percentage of samples with effective preprocessing and consistent sentiment: {validation_percentage:.2f}%")

# Calculate the percentage of cases where sentiment changed significantly
sentiment_change_threshold = 0.2
significant_changes = (df['sentiment_difference'] > sentiment_change_threshold).sum()
significant_change_percentage = (significant_changes / len(df)) * 100

print(f"\nSentiment Change Analysis:")
print(f"Percentage of samples with significant sentiment change: {significant_change_percentage:.2f}%")

# Sentiment distribution before and after preprocessing
plt.figure(figsize=(12, 6))
plt.hist(df['original_sentiment'], bins=50, alpha=0.5, label='Original Sentiment')
plt.hist(df['cleaned_sentiment'], bins=50, alpha=0.5, label='Cleaned Sentiment')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.title('Distribution of Sentiment Scores Before and After Preprocessing')
plt.legend()
plt.savefig(os.path.join(output_dir, 'sentiment_distribution_comparison.png'))  # Updated path
plt.close()
print("Sentiment distribution chart saved to:", os.path.join(output_dir, 'sentiment_distribution_comparison.png'))

# Document the preprocessing and sentiment analysis steps in the project guide file
with open("bc_project_guide.md", "a") as file:
    file.write("\n## Text Preprocessing and Sentiment Analysis\n")
    file.write("### Text Preprocessing Function\n")
    file.write("We defined an advanced function `preprocess_text` to perform text cleaning while preserving context. This function:\n")
    file.write("- Converts text to lowercase\n")
    file.write("- Removes URLs\n")
    file.write("- Keeps alphanumeric characters, spaces, and important punctuation\n")
    file.write("- Tokenizes the text\n")
    file.write("- Lemmatizes words and removes stopwords, but keeps important medical terms and negations\n")
    file.write("- Joins the cleaned tokens back into a single string\n\n")
    file.write("We applied this function to the `content_en` column of the DataFrame, creating a new column `cleaned_content` with the cleaned text.\n")
    file.write("\n### Sentiment Analysis\n")
    file.write("We used TextBlob to perform sentiment analysis on both the original and cleaned text, creating `original_sentiment` and `cleaned_sentiment` columns.\n")
    file.write("\nWe performed additional analysis:\n")
    file.write("- Compared text lengths before and after preprocessing\n")
    file.write("- Visualized text length distribution (saved as 'text_length_distribution.png')\n")
    file.write("- Created word clouds for original and cleaned text (saved as PNG files)\n")
    file.write("- Validated the preprocessing and sentiment consistency on a sample of the data\n")
    file.write(f"- Preprocessing was effective and sentiment remained consistent for {validation_percentage:.2f}% of the sampled data\n")
    file.write(f"- {significant_change_percentage:.2f}% of samples showed significant sentiment change after preprocessing\n")
    file.write("- Visualized the distribution of sentiment scores before and after preprocessing (saved as 'sentiment_distribution_comparison.png')\n")

print("Preprocessing, sentiment analysis complete, and documentation updated.")


[nltk_data] Downloading package punkt to C:\Users\blc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\blc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\blc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


KeyboardInterrupt: 

In [None]:
df.head()

Unnamed: 0,country_code,type,created_at,content_en,cleaned_content,original_length,cleaned_length,sentiment_score,original_sentiment,cleaned_sentiment,sentiment_difference
0,br,opinion,2024-04-06,"A doctor who is not very inclusive, it seems t...",doctor not inclusive seems want immediately di...,1018,584,-0.116162,-0.049621,-0.116162,0.06654
1,br,opinion,2024-04-12,Although the online schedule showed availabili...,although online schedule showed availability 1...,154,115,0.1,0.1,0.1,0.0
2,br,opinion,2024-04-22,Didn't answer my wife. After traveling more th...,didnt answer wife . traveling three hour even ...,246,166,-0.266667,0.04,-0.266667,0.306667
3,br,opinion,2024-04-27,\nHe works at the Santo Amaro rehabilitation c...,work santo amaro rehabilitation center not pre...,624,420,-0.202778,-0.219444,-0.202778,0.016667
4,br,opinion,2024-04-29,This psychologist canceled the 1st appointment...,psychologist canceled 1st appointment late sec...,636,390,0.057143,0.057143,0.057143,0.0


versao 2

## 2. Initial Exploratory Analysis