Permalink
Browse files

Implement SSE2 version of loadAlphaData

http://codereview.appspot.com/6050054/
Signed-off-by: Nicolas Capens
Signed-off-by: Daniel Koch
Authored-by: Jin Yang

With this patch, my HTML5 2D canvas benchmark, run in Chromium on Windows 7 with GPU
acceleration, improves by about 4%, even though most of the execution time is spent on the GPU.

git-svn-id: http://angleproject.googlecode.com/svn/trunk@1067 736b8ea6-26fd-11df-bfd4-992fa37f6226
  • Loading branch information...
1 parent 084529c commit c8998e03ab91a4f40da7e3da4c453f2633765811 daniel@transgaming.com committed May 1, 2012
Showing with 53 additions and 3 deletions.
  1. +1 −0 CONTRIBUTORS
  2. +1 −1 src/common/version.h
  3. +49 −2 src/libGLESv2/Texture.cpp
  4. +2 −0 src/libGLESv2/Texture.h
View
@@ -48,4 +48,5 @@ Mark Callow
Yuriy O'Donnell
Sam Hocevar
Pierre Leveille
+Jin Yang
@@ -1,7 +1,7 @@
#define MAJOR_VERSION 1
#define MINOR_VERSION 0
#define BUILD_VERSION 0
-#define BUILD_REVISION 1064
+#define BUILD_REVISION 1067
#define STRINGIFY(x) #x
#define MACRO_STRINGIFY(x) STRINGIFY(x)
@@ -306,7 +306,14 @@ void Image::loadData(GLint xoffset, GLint yoffset, GLsizei width, GLsizei height
switch (mFormat)
{
case GL_ALPHA:
- loadAlphaData(width, height, inputPitch, input, locked.Pitch, locked.pBits);
+ if (supportsSSE2())
+ {
+ loadAlphaDataSSE2(width, height, inputPitch, input, locked.Pitch, locked.pBits);
+ }
+ else
+ {
+ loadAlphaData(width, height, inputPitch, input, locked.Pitch, locked.pBits);
+ }
break;
case GL_LUMINANCE:
loadLuminanceData(width, height, inputPitch, input, locked.Pitch, locked.pBits, getD3DFormat() == D3DFMT_L8);
@@ -430,6 +437,46 @@ void Image::loadAlphaData(GLsizei width, GLsizei height,
}
}
+void Image::loadAlphaDataSSE2(GLsizei width, GLsizei height,
+ int inputPitch, const void *input, size_t outputPitch, void *output) const
+{ // For each of `height` rows, writes `width` 32-bit values equal to (source byte << 24); SSE2 converts 8 pixels per iteration.
+ const unsigned char *source = NULL;
+ unsigned int *dest = NULL;
+ __m128i zeroWide = _mm_setzero_si128();
+
+ for (int y = 0; y < height; y++)
+ {
+ source = static_cast<const unsigned char*>(input) + y * inputPitch;
+ dest = reinterpret_cast<unsigned int*>(static_cast<unsigned char*>(output) + y * outputPitch);
+
+ int x;
+ // Convert leading pixels one at a time until &dest[x] is 16-byte aligned (or the row ends),
+ // so that the _mm_store_si128 writes in the vector loop below hit aligned addresses.
+ for (x = 0; ((reinterpret_cast<intptr_t>(&dest[x]) & 0xF) != 0 && x < width); x++)
+ {
+ dest[x] = static_cast<unsigned int>(source[x]) << 24;
+ }
+
+ for (; x + 7 < width; x += 8)
+ {
+ __m128i sourceData = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&source[x])); // 8 source bytes into the low 64 bits
+ // Interleave zero bytes with the 8 source bytes: each source byte becomes the
+ // high byte of a 16-bit lane and the low byte of that lane is zero.
+ sourceData = _mm_unpacklo_epi8(zeroWide, sourceData);
+ // Interleave zero words with the 16-bit lanes: each lane becomes the high half of a
+ // 32-bit value (i.e. source byte << 24). `lo` holds pixels x..x+3, `hi` holds x+4..x+7.
+ __m128i lo = _mm_unpacklo_epi16(zeroWide, sourceData);
+ __m128i hi = _mm_unpackhi_epi16(zeroWide, sourceData);
+
+ _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), lo);
+ _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x + 4]), hi);
+ }
+
+ // Convert the remaining pixels of the row (fewer than 8) with the scalar path.
+ for (; x < width; x++)
+ {
+ dest[x] = static_cast<unsigned int>(source[x]) << 24;
+ }
+ }
+}
+
void Image::loadAlphaFloatData(GLsizei width, GLsizei height,
int inputPitch, const void *input, size_t outputPitch, void *output) const
{
@@ -3066,4 +3113,4 @@ TextureStorage *TextureCubeMap::getStorage(bool renderTarget)
return mTexStorage;
}
-}
+}
@@ -71,6 +71,8 @@ class Image
void loadAlphaData(GLsizei width, GLsizei height,
int inputPitch, const void *input, size_t outputPitch, void *output) const;
+ void loadAlphaDataSSE2(GLsizei width, GLsizei height,
+ int inputPitch, const void *input, size_t outputPitch, void *output) const;
void loadAlphaFloatData(GLsizei width, GLsizei height,
int inputPitch, const void *input, size_t outputPitch, void *output) const;
void loadAlphaHalfFloatData(GLsizei width, GLsizei height,

0 comments on commit c8998e0

Please sign in to comment.