In [14]:
import pandas as pd
import boto3
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pickle
from service.properties.secrets import AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
from service.properties.local_properties import BUCKET_NAME, REGION_NAME, DATA_KEY, MODEL_KEY

### Load dataset from Amazon S3

In [15]:
# Create the S3 client used by the rest of the notebook, authenticating with
# the key pair and region imported from the project's properties modules.
client_options = {
    'aws_access_key_id': AWS_ACCESS_KEY_ID,
    'aws_secret_access_key': AWS_SECRET_ACCESS_KEY,
    'region_name': REGION_NAME,
}
s3 = boto3.client('s3', **client_options)

In [16]:
# Download the dataset object from S3, decode its payload as UTF-8 text and
# parse that text as CSV into a DataFrame.
obj = s3.get_object(Bucket=BUCKET_NAME, Key=DATA_KEY)
raw_bytes = obj['Body'].read()
string_data = raw_bytes.decode('utf-8')
df = pd.read_csv(StringIO(string_data))

# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Id         720 non-null    int64  
 1   Title      720 non-null    object 
 2   Year       720 non-null    int64  
 3   Rating     714 non-null    float64
 4   Views      715 non-null    float64
 5   Likes      715 non-null    float64
 6   Minutes    720 non-null    int64  
 7   Tagline    667 non-null    object 
 8   Language   720 non-null    object 
 9   Countries  720 non-null    object 
 10  Directors  720 non-null    object 
 11  Cast       720 non-null    object 
dtypes: float64(3), int64(3), object(6)
memory usage: 67.6+ KB


#### Preprocessing

In [7]:
# Keep only the columns used for modelling, drop every row that has a missing
# value in any of them, and cast the Views/Likes counts to integers
# (df.info() above reports both as float64 with a few nulls).
modelling_columns = ['Title', 'Year', 'Rating', 'Views', 'Likes', 'Minutes']
df_subset = (
    df[modelling_columns]
    .dropna()
    .astype({'Views': int, 'Likes': int})
)
df_subset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 713 entries, 0 to 719
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Title    713 non-null    object 
 1   Year     713 non-null    int64  
 2   Rating   713 non-null    float64
 3   Views    713 non-null    int64  
 4   Likes    713 non-null    int64  
 5   Minutes  713 non-null    int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 39.0+ KB


In [8]:
# Preview the first five rows of the cleaned subset.
df_subset.head(n=5)

Unnamed: 0,Title,Year,Rating,Views,Likes,Minutes
0,Barbie,2023,3.9,3703904,1605943,114
1,Parasite,2019,4.6,3832192,2191347,133
2,Everything Everywhere All at Once,2022,4.3,2934908,1485981,140
3,Fight Club,1999,4.3,3763446,1737513,139
4,La La Land,2016,4.1,3231719,1439416,129


In [9]:
# Separate the cleaned frame into the predictor columns (X) and the
# regression target (y).
feature_columns = ['Year', 'Views', 'Likes', 'Minutes']
target_column = 'Rating'
X = df_subset.loc[:, feature_columns]
y = df_subset.loc[:, target_column]

In [10]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

### Train a linear regression model and upload the pickled model to Amazon S3

In [11]:
# Fit a linear regression on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Evaluate on the held-out split (score() returns the R^2 coefficient).
lr.score(X_test, y_test)

0.5797171085240502

In [12]:
# Serialize the fitted model with pickle and upload the resulting bytes to S3
# under MODEL_KEY.
# NOTE: whoever loads this object must trust the bucket contents, because
# unpickling can execute arbitrary code.
serialized_model = pickle.dumps(lr)
s3.put_object(
    Bucket=BUCKET_NAME,
    Key=MODEL_KEY,
    Body=serialized_model,
)

{'ResponseMetadata': {'RequestId': 'RAGNMDXJDE35FM7W',
  'HostId': 'MIkEGKSPA9s/ES7evJi70gj2kJjiP/TZHjqF6lQ6+uqqT5L7yb3aIyOOob7Z/XI54HiEzrw8fpA=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'MIkEGKSPA9s/ES7evJi70gj2kJjiP/TZHjqF6lQ6+uqqT5L7yb3aIyOOob7Z/XI54HiEzrw8fpA=',
   'x-amz-request-id': 'RAGNMDXJDE35FM7W',
   'date': 'Fri, 07 Jun 2024 15:00:43 GMT',
   'x-amz-version-id': 't4dSad8Dpn_W6i128pkvGMXT_ugyqCTM',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"0930c602651ca9034d5f385b069186a9"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"0930c602651ca9034d5f385b069186a9"',
 'ServerSideEncryption': 'AES256',
 'VersionId': 't4dSad8Dpn_W6i128pkvGMXT_ugyqCTM'}

#### Test predictions

In [76]:
# Show the second-to-last test row as a 1 x n_features array, i.e. the shape
# a single-sample predict() call expects.
X_test.iloc[[-2]].to_numpy()

array([[   1990, 2337261,  550460,     103]])

In [78]:
# Show the full original record for the same test row (second to last in
# X_test). X_test keeps df's row labels through dropna() and
# train_test_split(), so selecting by label replaces the Year/Views values
# that were hand-copied from the previous cell's output: the lookup stays
# correct if the data or the split changes and always returns exactly one row.
df.loc[[X_test.index[-2]]]

Unnamed: 0,Id,Title,Year,Rating,Views,Likes,Minutes,Language,Countries,Directors,Cast
153,51381,Home Alone,1990,3.8,2337261,550460,103,English,['USA'],['Chris Columbus'],"['Macaulay Culkin', 'Joe Pesci', 'Daniel Stern..."


In [77]:
# Predict ratings for the whole test split and display the prediction for the
# second-to-last row, i.e. the record inspected in the cells above.
test_predictions = lr.predict(X_test)
test_predictions[-2]

3.607636669378028

##### LR model analysis (further work)

In [73]:
# import seaborn as sns
# import statsmodels.api as sm
# 
# X.corr()

In [74]:
# sns.pairplot(X)

In [75]:
# X_with_const = sm.add_constant(X_train)

# model = sm.OLS(y_train, X_with_const).fit()
# model.summary()