In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv


In [3]:
import pandas as pd
import numpy as np

In [4]:
test = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')

In [5]:
train = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')

Preprocessing Text

In [6]:
import re

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply the preprocessing function to the training data
train['clean_text'] = train['full_text'].apply(preprocess_text)
train[['full_text', 'clean_text']].head()

Unnamed: 0,full_text,clean_text
0,Many people have car where they live. The thin...,many people have car where they live the thing...
1,I am a scientist at NASA that is discussing th...,i am a scientist at nasa that is discussing th...
2,People always wish they had the same technolog...,people always wish they had the same technolog...
3,"We all heard about Venus, the planet without a...",we all heard about venus the planet without al...
4,"Dear, State Senator\n\nThis is a letter to arg...",dear state senator this is a letter to argue i...


In [7]:
train.columns

Index(['essay_id', 'full_text', 'score', 'clean_text'], dtype='object')

Use the TF-IDF (Term Frequency-Inverse Document Frequency) vectorization

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the training data
tfidf_train = vectorizer.fit_transform(train['clean_text'])

# Display the shape of the transformed data
tfidf_train.shape

(17307, 5000)

Train the model using the TF-IDF features and the essay scores.

In [9]:
from sklearn.linear_model import Ridge

# Initialize the Ridge Regression model
model = Ridge()

# Train the model on the training data
model.fit(tfidf_train, train['score'])

In [12]:
# Apply the preprocessing function to the test data
test['clean_text'] = test['full_text'].apply(preprocess_text)

# Transform the test data using the trained TF-IDF vectorizer
tfidf_test = vectorizer.transform(test['clean_text'])

# Use the trained model to predict the scores for the test data
predicted_scores = model.predict(tfidf_test)

In [13]:
predicted_scores

array([2.10044785, 2.87664519, 4.74899694])

The predicted scores for the essays in the test :

- Essay 1: approximately 2.10
- Essay 2: approximately 2.87
- Essay 3: approximately 4.75

Create Submission csv

In [15]:
import pandas as pd

# Create a DataFrame to store the essay IDs and the predicted scores
submission_df = pd.DataFrame({
    'essay_id': test['full_text'],
    'predicted_score': predicted_scores
})

# Save the DataFrame to a CSV file
submission_df.to_csv('submission.csv', index=False)

In [16]:
submission_df

Unnamed: 0,essay_id,predicted_score
0,Many people have car where they live. The thin...,2.100448
1,I am a scientist at NASA that is discussing th...,2.876645
2,People always wish they had the same technolog...,4.748997
