# Import & load df

In [17]:
import pandas as pd
import html

import re

In [18]:
pd.set_option('display.max_columns', None)
art_df = pd.read_csv("../data/WikiArt-Emotions/WikiArt-Emotions-All.tsv", sep='\t')

# Apply functions from 01.Image_Extraction notebook

In [19]:
%run 01.Image_Extraction.ipynb

In [20]:
art_df = clean_html_text(art_df)

In [21]:
art_df = clean_category(art_df)

# Clean years

In [22]:
def century_clean(df):
    """
    this function performs the cleanup of the values in the "Year" column. 
    It uses a dictionary to map each abbreviated century to a numeric value 
    corresponding to the average between the beginning and the approximate 
    end of the century.
    """
    
    cambios = {
        'XVI cent.': '1550',
        'XV-XVI cent.': '1550',
        'XVII cent.': '1650',
        'XVI-XVII cent.': '1600',
        'XVIII cent.': '1750',
        'XVIII-XIX cent.': '1800',
        'XIX cent.': '1850',
        'XIX-XX cent.': '1950',
        'XVII-XVIII cent.': '1750',
        'XX-XXI cent.': '2000',
        'XV cent.': '1450',
        'XIV-XV cent.': '1350',
        'XX cent.': '1900'
}
    
    df['Year'] = df['Year'].replace(cambios)
    
    return df

In [23]:
art_df = century_clean(art_df)

In [24]:
def year_clean(row):
    """
    this function performs a cleanup of the Year column. 
    It converts ranges of years to a single value as the mean rounded up.
    """
    value = row[5]
    try:
        start_year, end_year = map(int, value.split('-'))
        average = (start_year + end_year) // 2 + (start_year + end_year) % 2
        return average
    except (ValueError, AttributeError):
        return value

In [25]:
def apply_year_clean(df):
    """
    This function applies the year_clean function to the entire df to obtain
    a df with a new column containing the cleaned values of the "Year" column.
    """
    df["Year_2"] = df.apply(year_clean, axis=1)
    return df

In [26]:
art_df = apply_year_clean(art_df)

In [27]:
def rename__reorder_columns(df):
    """
    This function renames and orders the columns of the df.
    """
    new_column_names = {
        'ID': 'ID',
        'Style': 'Style',
        'Category': 'Movement',
        'Artist': 'Artist',
        'Title': 'Title',
        'Year': 'Year',
        'Year_2': 'Year_2',
        'Is painting': 'Painting_Y_N',
        'Face/body': 'Face-body',
        'Ave. art rating': 'Avg_rating',
        'Art (image+title): agreeableness': 'IT_agreeableness',
        'Art (image+title): anger': 'IT_anger',
        'Art (image+title): anticipation': 'IT_anticipation',
        'Art (image+title): arrogance': 'IT_arrogance',
        'Art (image+title): disagreeableness': 'IT_disagreeableness',
        'Art (image+title): disgust': 'IT_disgust',
        'Art (image+title): fear': 'IT_fear',
        'Art (image+title): gratitude': 'IT_gratitude',
        'Art (image+title): happiness': 'IT_happiness',
        'Art (image+title): humility': 'IT_humility',
        'Art (image+title): love': 'IT_love',
        'Art (image+title): optimism': 'IT_optimism',
        'Art (image+title): pessimism': 'IT_pessimism',
        'Art (image+title): regret': 'IT_regret',
        'Art (image+title): sadness': 'IT_sadness',
        'Art (image+title): shame': 'IT_shame',
        'Art (image+title): shyness': 'IT_shyness',
        'Art (image+title): surprise': 'IT_surprise',
        'Art (image+title): trust': 'IT_trust',
        'Art (image+title): neutral': 'IT_neutral',
        'ImageOnly: agreeableness': 'I_agreeableness',
        'ImageOnly: anger': 'I_anger',
        'ImageOnly: anticipation': 'I_anticipation',
        'ImageOnly: arrogance': 'I_arrogance',
        'ImageOnly: disagreeableness': 'I_disagreeableness',
        'ImageOnly: disgust': 'I_disgust',
        'ImageOnly: fear': 'I_fear',
        'ImageOnly: gratitude': 'I_gratitude',
        'ImageOnly: happiness': 'I_happiness',
        'ImageOnly: humility': 'I_humility',
        'ImageOnly: love': 'I_love',
        'ImageOnly: optimism': 'I_optimism',
        'ImageOnly: pessimism': 'I_pessimism',
        'ImageOnly: regret': 'I_regret',
        'ImageOnly: sadness': 'I_sadness',
        'ImageOnly: shame': 'I_shame',
        'ImageOnly: shyness': 'I_shyness',
        'ImageOnly: surprise': 'I_surprise',
        'ImageOnly: trust': 'I_trust',
        'ImageOnly: neutral': 'I_neutral',
        'TitleOnly: agreeableness': 'T_agreeableness',
        'TitleOnly: anger': 'T_anger',
        'TitleOnly: anticipation': 'T_anticipation',
        'TitleOnly: arrogance': 'T_arrogance',
        'TitleOnly: disagreeableness': 'T_disagreeableness',
        'TitleOnly: disgust': 'T_disgust',
        'TitleOnly: fear': 'T_fear',
        'TitleOnly: gratitude': 'T_gratitude',
        'TitleOnly: happiness': 'T_happiness',
        'TitleOnly: humility': 'T_humility',
        'TitleOnly: love': 'T_love',
        'TitleOnly: optimism': 'T_optimism',
        'TitleOnly: pessimism': 'T_pessimism',
        'TitleOnly: regret': 'T_regret',
        'TitleOnly: sadness': 'T_sadness',
        'TitleOnly: shame': 'T_shame',
        'TitleOnly: shyness': 'T_shyness',
        'TitleOnly: surprise': 'T_surprise',
        'TitleOnly: trust': 'T_trust',
        'TitleOnly: neutral': 'T_neutral'
    }

    column_order = [
        'ID',
        'Style',
        'Movement',
        'Artist',
        'Title',
        'Year',
        'Year_2',
        'Painting_Y_N',
        'Face-body',
        'Avg_rating',        
        # Image + Title
            # positive
        'IT_gratitude',
        'IT_happiness',
        'IT_humility',
        'IT_love',
        'IT_optimism',
        'IT_trust',        
            # neutral             
        'IT_agreeableness',
        'IT_anticipation',
        'IT_disagreeableness',
        'IT_shyness',
        'IT_surprise',
        'IT_neutral',
            # negative        
        'IT_anger',
        'IT_arrogance',        
        'IT_disgust',
        'IT_fear',
        'IT_pessimism',
        'IT_regret',
        'IT_sadness',
        'IT_shame',
        
        # Image
            # positive        
        'I_gratitude',
        'I_happiness',
        'I_humility',
        'I_love',
        'I_optimism',
        'I_trust',        
             # neutral                  
        'I_agreeableness',
        'I_anticipation',
        'I_disagreeableness',
        'I_shyness',
        'I_surprise',
        'I_neutral',
            # negative              
        'I_anger',
        'I_arrogance',        
        'I_disgust',
        'I_fear',
        'I_pessimism',
        'I_regret',
        'I_sadness',
        'I_shame',    
        
        # Title
            # positive        
        'T_gratitude',
        'T_happiness',
        'T_humility',
        'T_love',
        'T_optimism',
        'T_trust',        
             # neutral                  
        'T_agreeableness',
        'T_anticipation',
        'T_disagreeableness',
        'T_shyness',
        'T_surprise',
        'T_neutral',
            # negative              
        'T_anger',
        'T_arrogance',        
        'T_disgust',
        'T_fear',
        'T_pessimism',
        'T_regret',
        'T_sadness',
        'T_shame',  
            
    ]

    df = df.rename(columns=new_column_names)
    df = df[column_order]
    return df


In [28]:
art_df = rename__reorder_columns(art_df)

In [29]:
def avg_feelings(df):
    """
    This function calculates the average of the columns with positive, negative and neutral sentiments.
    """
    avg_pos = df[['IT_gratitude', 'IT_happiness', 'IT_humility', 'IT_love', 'IT_optimism', 'IT_trust']].mean(axis=1).round(3)
    df['Avg_pos'] = avg_pos

    avg_neu = df[['T_agreeableness', 'T_anticipation', 'T_disagreeableness', 'T_shyness', 'T_surprise', 'T_neutral']].mean(axis=1).round(3)
    df['Avg_neu'] = avg_neu

    avg_neg = df[['T_anger', 'T_arrogance', 'T_disgust', 'T_fear', 'T_pessimism', 'T_regret', 'T_sadness', 'T_shame']].mean(axis=1).round(3)
    df['Avg_neg'] = avg_neg

    return df

In [30]:
art_df = avg_feelings (art_df)

In [31]:
art_df.sample()

Unnamed: 0,ID,Style,Movement,Artist,Title,Year,Year_2,Painting_Y_N,Face-body,Avg_rating,IT_gratitude,IT_happiness,IT_humility,IT_love,IT_optimism,IT_trust,IT_agreeableness,IT_anticipation,IT_disagreeableness,IT_shyness,IT_surprise,IT_neutral,IT_anger,IT_arrogance,IT_disgust,IT_fear,IT_pessimism,IT_regret,IT_sadness,IT_shame,I_gratitude,I_happiness,I_humility,I_love,I_optimism,I_trust,I_agreeableness,I_anticipation,I_disagreeableness,I_shyness,I_surprise,I_neutral,I_anger,I_arrogance,I_disgust,I_fear,I_pessimism,I_regret,I_sadness,I_shame,T_gratitude,T_happiness,T_humility,T_love,T_optimism,T_trust,T_agreeableness,T_anticipation,T_disagreeableness,T_shyness,T_surprise,T_neutral,T_anger,T_arrogance,T_disgust,T_fear,T_pessimism,T_regret,T_sadness,T_shame,Avg_pos,Avg_neu,Avg_neg
2568,5772823bedc2cb3880f6e6ad,Modern Art,Pop-Art,Peter Max,Love,1969,1969,yes,face,2.0,0.091,0.545,0.0,0.818,0.364,0.182,0.0,0.182,0.0,0.0,0.273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.636,0.0,0.545,0.364,0.182,0.0,0.273,0.0,0.0,0.364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.727,0.0,0.818,0.273,0.182,0.0,0.182,0.0,0.0,0.182,0.0,0.091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333,0.061,0.011


In [32]:
#art_df.to_csv("../data/WikiArt-Emotions-Clean.csv", index=False)

In [36]:
art_df.Year_2

0       1889
1       1984
2       1906
3       1920
4       1828
        ... 
4100    1957
4101    1934
4102    1979
4103    1909
4104    1908
Name: Year_2, Length: 4105, dtype: object