In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
base_path = '/kaggle/input/feedback-prize-english-language-learning/'
train_data = pd.read_csv(base_path + 'train.csv')
test_data = pd.read_csv(base_path + 'test.csv')
sample = pd.read_csv(base_path + 'sample_submission.csv')

**XGBOOST**

In [3]:
X = train_data['full_text']
y = train_data[['cohesion','syntax','vocabulary','phraseology','grammar','conventions']]
X_test = test_data['full_text']

In [4]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

STOPWORDS = set(stopwords.words('english'))
ps = PorterStemmer()

def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

def data_preprocess(text):
    text = text.strip() #removes blank spaces before and after the text
    text = re.sub(r'\n', '', text) #regex to replace the new line characters with empty
    text = text.lower() #lower case conversion
    text = ps.stem(text) #stem the words
    text = remove_stopwords(text)
    return text


X= X.apply(data_preprocess)
X_test = X_test.apply(data_preprocess)

In [5]:
tfidf_vectorizer = TfidfVectorizer(max_features = 15000, ngram_range=(2,2))
X_train = tfidf_vectorizer.fit_transform(X)
X_test = tfidf_vectorizer.transform(X_test)

In [6]:
# count_vectorizer = CountVectorizer(max_features = 15000)
# X = count_vectorizer.fit_transform(X)
# X_test = count_vectorizer.fit_transform(X_test)

In [7]:
from xgboost import XGBRegressor

def xgboost_syntax(X,y,x_sub):
   
    xgb = XGBRegressor(objective ='reg:squarederror')
    xgb.fit(X, y)
    return  xgb.predict(x_sub)


In [8]:
#'cohesion','syntax','vocabulary','phraseology','grammar','conventions'

cohesion = xgboost_syntax(X_train,y['cohesion'],X_test)
syntax = xgboost_syntax(X_train,y['syntax'],X_test)
vocabulary = xgboost_syntax(X_train,y['vocabulary'],X_test)
phraseology = xgboost_syntax(X_train,y['phraseology'],X_test)
grammar = xgboost_syntax(X_train,y['grammar'],X_test)
conventions = xgboost_syntax(X_train,y['conventions'],X_test)

In [9]:
sample['cohesion']=cohesion
sample['syntax']=syntax
sample['vocabulary']=vocabulary
sample['phraseology']=phraseology
sample['grammar']=grammar
sample['conventions']=conventions
sample['text_id']=test_data['text_id']
sample.to_csv('submission.csv',index=False)

In [10]:
sample

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.986126,2.832387,3.320928,3.156395,2.896758,3.011446
1,000BAD50D026,2.993685,2.77001,3.008603,2.964389,2.38243,2.760096
2,00367BB2546B,3.002274,3.680946,3.062879,2.849426,3.997732,3.680632
