In [26]:
import pandas as pd

# Load the labelled code-comment dataset and take a first look at it.
df = pd.read_csv("Code_Comment_Seed_Data.csv")
print(df.head())
print("\n")
print(df.shape)
print("\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()

       Comments                           Surrounding Code Context       Class
0  /*test 529*/  -10.   int res = 0;\n-9.   CURL *curl = NULL;\...  Not Useful
1  /*test 525*/  -2.     fprintf(stderr, "Usage: lib529 [url] [...  Not Useful
2      /*done*/  -10.   multi_add_handle(m, curl);\n-9.   for(;...  Not Useful
3  /*test 529*/  -10.   int res = 0;\n-9.   CURL *curl = NULL;\...  Not Useful
4  /*test 525*/  -2.     fprintf(stderr, "Usage: lib529 [url] [...  Not Useful


(11452, 3)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11452 entries, 0 to 11451
Data columns (total 3 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Comments                  11452 non-null  object
 1   Surrounding Code Context  11452 non-null  object
 2   Class                     11452 non-null  object
dtypes: object(3)
memory usage: 268.5+ KB
None


In [27]:
# Count the missing values in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

Comments                    0
Surrounding Code Context    0
Class                       0
dtype: int64


In [28]:
# keep=False marks every row that has an identical twin, so the total below
# counts all members of each duplicate group, first occurrences included.
duplicate_mask = df.duplicated(keep=False)
total_duplicates = duplicate_mask.sum()
print(f"\nTotal number of duplicate rows (all columns): {total_duplicates}")
print(df.shape)


Total number of duplicate rows (all columns): 5098
(11452, 3)


In [29]:
# Tally how many times each fully-duplicated row occurs (grouping on every column).
all_columns = df.columns.tolist()
repeated_rows = df[df.duplicated(keep=False)]
duplicate_counts = (
    repeated_rows
    .groupby(all_columns)
    .size()
    .reset_index(name='count')
)
print("\nDuplicate rows and their counts (all columns):")
print(duplicate_counts)


Duplicate rows and their counts (all columns):
                                               Comments  \
0     /*\nBivariate Cubic Spline Approximation using...   
1     /*\nDelaunay Triangulation Linear Interpolatio...   
2     /*\nFor the initial set of points, just displa...   
3     /*\nNatural Neighbors using Pavel Sakov's nn p...   
4     /*! @file\n!\n! Point-, symbol-, and string-pl...   
...                                                 ...   
1967                                    /*x_source_fe*/   
1968                                    /*x_source_fe*/   
1969                                     /*yepp, done*/   
1970                                  /*|| !string[i]*/   
1971                                             /*})*/   

                               Surrounding Code Context       Class  count  
0     \n// plgridd.c: Plot grids data from irregular...      Useful      2  
1     \n// plgridd.c: Plot grids data from irregular...      Useful      2  
2     \n//! 

In [54]:
# Keep the first occurrence of every fully-identical row and write the result to disk.
# NOTE(review): the recorded output reports 11 columns, so this cell was last
# run after helper columns had been added to df; whatever columns df holds at
# run time are also written to the CSV -- confirm that is intended.
df_no_duplicates = df.drop_duplicates(keep='first')
deduplicated_shape = df_no_duplicates.shape
print("\nShape after removing duplicates (all columns):", deduplicated_shape)
print(df_no_duplicates)
df_no_duplicates.to_csv('cleaned_no_duplicates.csv', index=False)


Shape after removing duplicates (all columns): (8326, 11)
                                                Comments  \
0                                           /*test 529*/   
1                                           /*test 525*/   
2                                               /*done*/   
5      /*argv1 = URL\n * argv2 = proxy\n * argv3 = no...   
6                                             /*unused*/   
...                                                  ...   
11447  /*The following document where the background ...   
11448  /*Do all the *safe* initialization - 'safe' me...   
11449  /*And set the rest of the structure to NULL to...   
11450  /*Use png_ptr here, not info_ptr, because by e...   
11451  /*Is the given gamma significantly different f...   

                                Surrounding Code Context       Class  \
0      -10.   int res = 0;\n-9.   CURL *curl = NULL;\...  Not Useful   
1      -2.     fprintf(stderr, "Usage: lib529 [url] [...  Not Useful   
2   

In [32]:

# Inspect the target column: the distinct labels and how often each one occurs.
labels = df['Class']
print("Unique labels:", labels.unique())
print("Label counts:\n", labels.value_counts())

Unique labels: ['Not Useful' 'Useful']
Label counts:
 Useful        7063
Not Useful    4389
Name: Class, dtype: int64


In [40]:
# Measure raw character lengths so empty or near-empty entries can be reported
# and then dropped.
df['comment_length'] = df['Comments'].str.len()
df['code_snippet_length'] = df['Surrounding Code Context'].str.len()

short_comments = df.loc[df['comment_length'] < 5, ['Comments', 'Class']]
short_snippets = df.loc[df['code_snippet_length'] < 5, ['Surrounding Code Context', 'Class']]
print("Comments with length < 5:", short_comments)
print("Code snippets with length < 5:", short_snippets)

# Keep only rows whose comment and code context are both at least 5 characters long.
df = df[df['comment_length'] >= 5]
df = df[df['code_snippet_length'] >= 5]
print(df.shape)

Comments with length < 5: Empty DataFrame
Columns: [Comments, Class]
Index: []
Code snippets with length < 5: Empty DataFrame
Columns: [Surrounding Code Context, Class]
Index: []
(11452, 7)


In [41]:
import re
def check_special_chars(text):
    """Return True when ``str(text)`` contains at least one non-ASCII character.

    The value is converted with ``str()`` first, so non-string inputs are
    checked through their string representation.
    """
    non_ascii_match = re.search(r'[^\x00-\x7F]', str(text))
    return non_ascii_match is not None
# Flag rows whose comment or code context contains a non-ASCII character,
# then report how many rows were flagged in each column.
comment_flags = df['Comments'].apply(check_special_chars)
code_flags = df['Surrounding Code Context'].apply(check_special_chars)
df['has_special_chars_comment'] = comment_flags
df['has_special_chars_code'] = code_flags

rows_with_special_comment = df[df['has_special_chars_comment']]
rows_with_special_code = df[df['has_special_chars_code']]
print("Rows with special characters in comments:", len(rows_with_special_comment))
print("Rows with special characters in code:", len(rows_with_special_code))

Rows with special characters in comments: 0
Rows with special characters in code: 0


In [44]:
# Compare the label balance before and after dropping exact duplicate rows.
class_share_all = df['Class'].value_counts(normalize=True)
print("Class distribution:\n", class_share_all)

df_no_duplicates = df.drop_duplicates(keep='first')
class_share_deduplicated = df_no_duplicates['Class'].value_counts(normalize=True)
print("Class distribution:\n", class_share_deduplicated)

Class distribution:
 Useful        0.616748
Not Useful    0.383252
Name: Class, dtype: float64
Class distribution:
 Useful        0.64785
Not Useful    0.35215
Name: Class, dtype: float64


In [51]:
# Flag comments whose whitespace-stripped text is shorter than 10 characters.
# Only length is checked here; nothing in this cell detects repetition.
stripped_length = df['Comments'].str.strip().str.len()
df['is_low_quality_comment'] = stripped_length < 10  # Adjust threshold
low_quality_rows = df[df['is_low_quality_comment']]
print("Low-quality comments:", low_quality_rows.shape[0])

Low-quality comments: 430


In [48]:
import re
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK stopword corpus, then build a set of the English stopwords
# for fast membership tests.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

def clean_text(text, is_code=False, stopword_set=None):
    """Normalise a comment or code snippet into a lower-case, single-spaced string.

    Every string input is lower-cased, stripped, and has each run of
    whitespace (newlines included) collapsed to one space. For non-code text,
    every character other than ASCII letters, digits and whitespace is then
    deleted and stopwords are dropped. Characters are deleted rather than
    replaced by a space, so ``done-flag`` becomes ``doneflag``.

    Args:
        text: Value to clean. Anything that is not a ``str`` yields ``''``.
        is_code: When true, skip punctuation and stopword removal so symbols
            and keywords survive. Case and line breaks are still normalised.
        stopword_set: Collection of words to drop from non-code text. When
            omitted, the notebook-level ``stop_words`` is used, which keeps
            existing calls working unchanged.

    Returns:
        The cleaned string.
    """
    if not isinstance(text, str):
        return ''
    normalized = re.sub(r'\s+', ' ', text.lower().strip())
    if is_code:
        return normalized
    if stopword_set is None:
        # Fall back to the stopwords built earlier in the notebook.
        stopword_set = stop_words
    without_punctuation = re.sub(r'[^a-zA-Z0-9\s]', '', normalized)
    kept_words = [word for word in without_punctuation.split() if word not in stopword_set]
    return ' '.join(kept_words)

# Clean comments as natural-language text and code contexts in code mode.
cleaned_comments = df['Comments'].apply(clean_text)
cleaned_snippets = df['Surrounding Code Context'].apply(clean_text, is_code=True)
df['clean_comment'] = cleaned_comments
df['clean_code_snippet'] = cleaned_snippets

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sunda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
# Replace the length columns with whitespace-separated token counts of the cleaned text.
comment_tokens = df['clean_comment'].str.split()
snippet_tokens = df['clean_code_snippet'].str.split()
df['comment_length'] = comment_tokens.str.len()  # Word count
df['code_snippet_length'] = snippet_tokens.str.len()
print("Comment length stats:\n", df['comment_length'].describe())
print("Code snippet length stats:\n", df['code_snippet_length'].describe())

# Rows strictly above the 95th percentile of each length are counted as outliers.
comment_outlier_threshold = df['comment_length'].quantile(0.95)
code_outlier_threshold = df['code_snippet_length'].quantile(0.95)
long_comments = df[df['comment_length'] > comment_outlier_threshold]
long_snippets = df[df['code_snippet_length'] > code_outlier_threshold]
print("Comment outliers (>95th percentile):", long_comments.shape[0])
print("Code snippet outliers (>95th percentile):", long_snippets.shape[0])

Comment length stats:
 count    11452.000000
mean         7.601467
std         13.083387
min          0.000000
25%          2.000000
50%          4.000000
75%          8.000000
max        316.000000
Name: comment_length, dtype: float64
Code snippet length stats:
 count    11452.000000
mean        31.674817
std         16.440361
min          1.000000
25%         24.000000
50%         31.000000
75%         38.000000
max        482.000000
Name: code_snippet_length, dtype: float64
Comment outliers (>95th percentile): 522
Code snippet outliers (>95th percentile): 490


In [45]:
# Flag comments that contain label-like wording.
# The terms are joined into one case-insensitive regex alternation and matched
# as substrings, so longer words such as "usefully" are flagged as well.
leakage_words = ['useful', 'not useful', 'helpful', 'unhelpful']
leakage_pattern = '|'.join(leakage_words)
df['has_leakage'] = df['Comments'].str.contains(leakage_pattern, case=False, na=False)
flagged_rows = df[df['has_leakage']]
print("Rows with potential leakage:", flagged_rows.shape[0])
print(flagged_rows[['Comments', 'Class']])

Rows with potential leakage: 25
                                                Comments   Class
5451   /*Otherwise, we need to consider hidden-line r...  Useful
5468   /*Otherwise, we need to consider hidden-line r...  Useful
6188   /*--------------------------------------------...  Useful
6190   /*--------------------------------------------...  Useful
6191   /*--------------------------------------------...  Useful
6192   /*--------------------------------------------...  Useful
6631   /*--------------------------------------------...  Useful
7400              /*Pointer to a useful terminal name.*/  Useful
7742              /*Pointer to a useful terminal name.*/  Useful
8131              /*Pointer to a useful terminal name.*/  Useful
9388   /*This does the same thing as the above howeve...  Useful
9395   /*Iterate through the usefully testable color ...  Useful
9438   /*There are two basic forms of standard images...  Useful
9439   /*Make a 'standard' palette.  Because there ar...  