# Traitement des Fichiers CSV traités avec l'analyse de sentiment
Ce notebook récupère les fichiers traités sur la machine distante et permet de les lire, 
de les labelliser et de les concaténer entre eux. 
À la fin, le fichier all.csv est le fichier contenant toute mon analyse de sentiment en termes de polarité, avec l'attribution d'un sentiment pour chaque tweet. Ce fichier va nous permettre de poursuivre l'analyse de la polarité en vue d'une éventuelle représentation graphique des groupes de sentiments (t-SNE, MNIST), ainsi que d'une analyse des émotions. 

In [1]:
import pandas as pd
import re
import io 

In [10]:
# Processed CSV files to load; all three live in the same "final" directory.
_final_dir = r"C:\Users\alyas\Desktop\TER\test_sentiment_analysis\final"
file_paths = [
    _final_dir + "\\" + csv_name
    for csv_name in (
        "cannabis_all.csv",
        "tweets_cancer_fasting_all.csv",
        "tweets_cancer_sport_all.csv",
    )
]

# Accumulator for the DataFrames built from the files above.
dfs = []

In [11]:

def preprocess_csv(file_path):
    """Load a scored tweet CSV and label each row with its top sentiment and emotion.

    The file is read as UTF-8, every non-ASCII character is removed from the
    raw text, and the result is parsed with pandas. Two columns are then added:

    * ``Sentiment``: whichever of ``positive`` / ``neutral`` / ``negative``
      holds the highest value in the row, written as ``pos`` / ``neutral`` /
      ``neg``.
    * ``Emotion``: the emotion column holding the highest value in the row.
      The label is the column name, except ``anger`` which becomes ``angry``.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        The labelled DataFrame, or ``None`` when reading, parsing or labelling
        fails. The failure is printed together with the offending path so a
        skipped file can be identified.
    """
    sentiment_labels = {'positive': 'pos', 'neutral': 'neutral', 'negative': 'neg'}
    emotion_columns = [
        'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
        'optimism', 'pessimism', 'sadness', 'surprise', 'trust',
    ]
    # Every emotion keeps its column name as label, apart from 'anger'.
    emotion_labels = {name: name for name in emotion_columns}
    emotion_labels['anger'] = 'angry'

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_content = file.read()
        # Drop every non-ASCII character before handing the text to pandas.
        cleaned_content = re.sub(r'[^\x00-\x7F]+', '', raw_content)
        df = pd.read_csv(io.StringIO(cleaned_content))

        # idxmax(axis=1) yields, per row, the name of the highest-scoring column.
        df['Sentiment'] = df[list(sentiment_labels)].idxmax(axis=1).map(sentiment_labels)
        df['Emotion'] = df[emotion_columns].idxmax(axis=1).map(emotion_labels)
    except Exception as e:
        # Best-effort loader: callers skip files that return None, so name the
        # file and the exception type to make the skipped input traceable.
        print(f"Error while processing {file_path}: {type(e).__name__}: {e}")
        return None
    return df


# Process each listed file; a file whose processing returned None is skipped.
for file_path in file_paths:
    df = preprocess_csv(file_path)
    if df is None:
        continue
    dfs.append(df)
    # Report the path and row count of every file that was kept.
    print(file_path, len(df))

# dfs now holds one processed DataFrame per file that was kept above.


C:\Users\alyas\Desktop\TER\test_sentiment_analysis\final\cannabis_all.csv 51268
C:\Users\alyas\Desktop\TER\test_sentiment_analysis\final\tweets_cancer_fasting_all.csv 1035
C:\Users\alyas\Desktop\TER\test_sentiment_analysis\final\tweets_cancer_sport_all.csv 591


In [12]:
# Stack every collected DataFrame into one, renumbering the index from 0.
result_df = pd.concat(dfs, ignore_index=True)

# Sanity check: the combined frame must have as many rows as its parts.
total_length = sum(map(len, dfs))
assert total_length == len(result_df), "Length mismatch! Check!"

# Show the combined DataFrame followed by its column names.
print(result_df)
print(result_df.columns)

                        id                                               Text  \
0      1062072791602786305  What are the benefits of #cannabis oil? They a...   
1       578675630646042624  "We found that almost 25%-35% of the #mice wer...   
2       590632386586611713  More research supporting that #cannabis shrink...   
3       657614802848739330  "Medical Marijuana Helped Save This Child From...   
4       587337763617382400  Babies inoperable tumor treated with cannabis ...   
...                    ...                                                ...   
52889  1019483156871831552  Football #training over 5years is associated w...   
52890   632563203348791296  Fundraiser set for Plainville CrossFit 508 tra...   
52891  1442048470710460417  Best of luck to our very good friend David who...   
52892   978166544999075840  #GabrieleGrunewald   fighter ... sportswoman ....   
52893   817309411358179329  On #CorkToday  Drink Driving, #Cancer Adverts,...   

       positive  neutral  n

In [13]:
# Write the combined DataFrame to disk without the row index column.
result_df.to_csv(
    path_or_buf=r"C:\Users\alyas\Desktop\TER\test_sentiment_analysis\final\sentiment_emotion_analysis_cancer.csv",
    index=False,
)

In [14]:
# Load the exported CSV back into a DataFrame.
r = pd.read_csv(
    filepath_or_buffer=r'C:\Users\alyas\Desktop\TER\test_sentiment_analysis\final\sentiment_emotion_analysis_cancer.csv',
    sep=",",
)
# Bare expression so the notebook renders the DataFrame as the cell output.
r

Unnamed: 0,id,Text,positive,neutral,negative,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust,Sentiment,Emotion
0,1062072791602786305,What are the benefits of #cannabis oil? They a...,0.9291,0.0679,0.0031,0.018493,0.400978,0.013348,0.158499,0.772732,0.062300,0.960550,0.018827,0.014124,0.031441,0.322952,pos,optimism
1,578675630646042624,"""We found that almost 25%-35% of the #mice wer...",0.8404,0.1542,0.0053,0.036988,0.379463,0.036752,0.054410,0.655105,0.017943,0.902406,0.023391,0.025516,0.033069,0.138189,pos,optimism
2,590632386586611713,More research supporting that #cannabis shrink...,0.7185,0.2775,0.0040,0.056147,0.482083,0.057878,0.086017,0.223089,0.009511,0.569962,0.030905,0.026709,0.037462,0.081598,pos,optimism
3,657614802848739330,"""Medical Marijuana Helped Save This Child From...",0.5276,0.4555,0.0169,0.013102,0.178219,0.031741,0.163743,0.253767,0.016345,0.541824,0.089296,0.255838,0.014495,0.032981,pos,optimism
4,587337763617382400,Babies inoperable tumor treated with cannabis ...,0.2615,0.6890,0.0495,0.021553,0.265327,0.040874,0.036170,0.418201,0.010211,0.352869,0.028329,0.064711,0.023945,0.026794,neutral,joy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52889,1019483156871831552,Football #training over 5years is associated w...,0.0380,0.8540,0.1080,0.030314,0.353299,0.077416,0.049767,0.230115,0.006761,0.122942,0.040196,0.083823,0.067873,0.020052,neutral,anticipation
52890,632563203348791296,Fundraiser set for Plainville CrossFit 508 tra...,0.0214,0.6935,0.2850,0.030266,0.070038,0.086480,0.061663,0.140779,0.021229,0.255524,0.379691,0.930155,0.012139,0.018998,neutral,sadness
52891,1442048470710460417,Best of luck to our very good friend David who...,0.9499,0.0470,0.0030,0.013102,0.097531,0.012220,0.065432,0.924974,0.156113,0.966607,0.018471,0.060576,0.012939,0.148206,pos,optimism
52892,978166544999075840,#GabrieleGrunewald fighter ... sportswoman ....,0.9574,0.0387,0.0039,0.017007,0.067766,0.021986,0.023935,0.686919,0.153535,0.955576,0.068616,0.368431,0.008388,0.204168,pos,optimism
