In [105]:
import numpy as np
import pandas as pd

df = pd.read_csv('./movie_scores.csv')

In [5]:
df.head()

Unnamed: 0,critic,movie,org,score
0,Steve Newton,Pitch Black,Georgia Straight,1.5/5
1,Tim Brayton,Pitch Black,Antagony & Ecstasy,7/10
2,Jeffrey M. Anderson,Pitch Black,Common Sense Media,3/5
3,John A. Nesbit,Pitch Black,Old School Reviews,B
4,Blake French,Pitch Black,Filmcritic.com,2/5


In [106]:
# Function that accepts string score in many different formats and returns a normalized fractional representation of that score
def toNormalized(score):
    # search for pattern '3(.0) / 5(.0)'
    if re.match('\A.?(\d+[\.\d]*) */ *(\d+[\.\d]*)', score):
        groups = re.search('\A.?(\d+[\.\d]*) */ *(\d+[\.\d]*)', score)
        return float(groups.group(1))/float(groups.group(2))
    
    # search for pattern '3(.0) of 5(.0)'
    elif re.match('(\d) of (\d)', score):
        groups = re.search('(\d) of (\d)', score)
        return float(groups.group(1))/float(groups.group(2))
    
    # search for pattern '3(.0) out of 5(.0)'
    elif re.match('(\d[.\d]*) out of (\d[.\d]*)', score):
        groups = re.search('(\d[.\d]*) out of (\d[.\d]*)', score)
        return float(groups.group(1))/float(groups.group(2))
    
    # search for pattern '3(.0) stars out of 5(.0)'
    elif re.match('(\d[.\d]*) stars out of (\d[.\d]*)', score):
        groups = re.search('(\d[.\d]*) stars out of (\d[.\d]*)', score)
        return float(groups.group(1))/float(groups.group(2))

    # search for standalone numbers
    elif re.match('\A(\.?\d+\.?\d*)\Z', score):
        groups = re.search('\A(\.?\d+\.?\d*)\Z', score)
        number = float(groups.group(1))
        if number >= 0 and number <= 5:
            return number/5
        elif number > 5 and number <= 10:
            return number/10
        elif number > 10:
            groups = re.search('([1-9])10', str(number))
            if groups is not None:
                return float(groups.group(1))/10
            else:
                return None
    
    # search for pattern '3(.0) stars'
    elif re.match('(\d[\.\d]*) stars', score):
        groups = re.search('(\d[\.\d]*) stars', score)
        return float(groups.group(1))/5
    
    # search for pattern '4(.0):5(.0)'
    elif re.match('(\d\.?\d*):(\d\.?\d*)', score):
        groups = re.search('(\d\.?\d*):(\d\.?\d*)', score)
        return float(groups.group(1))/float(groups.group(2))
    
    # search for pattern '***(1/2)'
    elif re.match('(\*+) ?(\d?)/?(\d?)', score):
        groups = re.search('(\*+) ?(\d?)/?(\d?)', score)
        
        if groups.group(2) == '':
            return len(list(groups.group(1)))/5
        else:
            return (len(list(groups.group(1))) + (float(groups.group(2))/float(groups.group(3))))/5
    
    # search for pattern 'A(+)(-minus)(+plus)'
    elif re.match('(\A[ABCDFabcdf])[ -]?(.*)', score):
        groups = re.search('(\A[ABCDFabcdf])[ -]?(.*)', score)
    
        if groups.group(1).upper() == 'A':
            if groups.group(2) in ['+', 'plus', 'PLUS']:
                return 1
            elif groups.group(2) == '':
                return .925
            else:
                return .85
        elif groups.group(1).upper() == 'B':
            if groups.group(2) in ['+', 'plus', 'PLUS']:
                return .775
            elif groups.group(2) == '':
                return .7
            else:
                return .625
        elif groups.group(1).upper() == 'C':
            if groups.group(2) in ['+', 'plus', 'PLUS']:
                return .55
            elif groups.group(2) == '':
                return .475
            else:
                return .4
        elif groups.group(1).upper() == 'D':
            if groups.group(2) in ['+', 'plus', 'PLUS']:
                return .325
            elif groups.group(2) == '':
                return .25
            else:
                return .175
        elif groups.group(1).upper() == 'F':
            return 0
        else:
            return None
    else:
        return None

In [107]:
# Normalize the scores for every movie/critic
for i in range(len(df['score'])):
    df['score'][i] = toNormalized(df['score'][i])

In [109]:
# Remove any movies with scores greater than 1 or None values
df_filtered = df[df.score <= 1]
df_filtered.score.unique()

array([0.3, 0.7, 0.6, 0.4, 0.625, 0, 0.325, 0.475, 0.75, 0.8, 0.875, 0.1,
       0.5, 0.775, 0.925, 0.9, 1.0, 0.2, 0.02, 0.55, 0.375, 0.25, 0.85,
       0.03, 0.45, 0.93, 0.58, 0.65, 0.8400000000000001, 0.62, 0.76, 0.72,
       0.09, 0.6980000000000001, 0.125, 0.6875, 0.64, 0.07,
       0.8800000000000001, 0.95, 0.05, 0.78, 0.9400000000000001, 0.35,
       0.77, 0.79, 0.601, 0.74, 0.63, 0.6900000000000001, 0.67,
       0.8300000000000001, 0.08, 0.6799999999999999, 0.8699999999999999,
       0.9869999999999999, 0.97, 0.5900000000000001, 0.7889999999999999,
       0.89, 0.15, 0.865, 0.8099999999999999, 0.9199999999999999, 0.82,
       0.600024, 0.9880000000000001, 0.9099999999999999, 0.575, 0.86,
       0.545, 0.71, 0.01, 0.8124, 0.9560000000000001, 0.06, 0.697, 0.889,
       0.9119999999999999, 0.6666666666666666, 0.945, 0.96, 0.898, 0.73,
       0.8765000000000001, 0.825, 0.49000000000000005, 0.99, 0.845, 0.04,
       0.975, 0.8880000000000001, 0.9890000000000001, 0.6599999999999999,
 

In [117]:
# Reset dataframe index so that the indexes are continuous
df_filtered = df_filtered.reset_index()[['critic', 'movie', 'org', 'score']]
df_filtered[330:340]

Unnamed: 0,critic,movie,org,score
330,Lisa Schwarzbaum,The Skin I Live In,Entertainment Weekly,0.775
331,Kate Erbland,The Skin I Live In,Film School Rejects,0.775
332,Jason Bailey,The Skin I Live In,DVDTalk.com,0.6
333,Kevin Harley,Haywire,Total Film,0.6
334,Fred Topel,Haywire,CraveOnline,0.9
335,Harvey S. Karten,Drinking Buddies,Modamag.com,0.7
336,Jesse Cataldo,Drinking Buddies,Slant Magazine,0.625
337,Vincent Mancini,Drinking Buddies,FilmDrunk,0.55
338,Alexander Lowe,Drinking Buddies,We Got This Covered,0.7
339,Nathan Rabin,Drinking Buddies,The Dissolve,0.5


In [119]:
# Truncate long float decimal values to values with no longer than 3 decimal places
for i in range(len(df_filtered.score)):
    df_filtered.score[i] = float('%.3f' % df_filtered.score[i])

In [120]:
df_filtered.score.unique()

array([0.3, 0.7, 0.6, 0.4, 0.625, 0.0, 0.325, 0.475, 0.75, 0.8, 0.875,
       0.1, 0.5, 0.775, 0.925, 0.9, 1.0, 0.2, 0.02, 0.55, 0.375, 0.25,
       0.85, 0.03, 0.45, 0.93, 0.58, 0.65, 0.84, 0.62, 0.76, 0.72, 0.09,
       0.698, 0.125, 0.688, 0.64, 0.07, 0.88, 0.95, 0.05, 0.78, 0.94,
       0.35, 0.77, 0.79, 0.601, 0.74, 0.63, 0.69, 0.67, 0.83, 0.08, 0.68,
       0.87, 0.987, 0.97, 0.59, 0.789, 0.89, 0.15, 0.865, 0.81, 0.92,
       0.82, 0.988, 0.91, 0.575, 0.86, 0.545, 0.71, 0.01, 0.812, 0.956,
       0.06, 0.697, 0.889, 0.912, 0.667, 0.945, 0.96, 0.898, 0.73, 0.877,
       0.825, 0.49, 0.99, 0.845, 0.04, 0.975, 0.888, 0.989, 0.66, 0.895,
       0.32, 0.952, 0.761, 0.978, 0.965, 0.521, 0.46, 0.997, 0.921, 0.979,
       0.479, 0.935, 0.915, 0.995, 0.976, 0.56, 0.587, 0.569, 0.175,
       0.722, 0.985, 0.542, 0.525, 0.502, 0.52, 0.832, 0.833, 0.938,
       0.821, 0.762, 0.98, 0.986, 0.914, 0.919, 0.289, 0.609, 0.964,
       0.675, 0.721, 0.974, 0.578, 0.555, 0.876, 0.795, 0.902, 0.826,


In [123]:
# Write filtered table to a csv file
df_filtered.to_csv('./filtered_scores.csv', index=False)